1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
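// For example, <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.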
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the size in bits up to the next power of two.
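// For example, s48 becomes s64 and s33 becomes s64.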
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
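// Reduce the number of vector elements so each resulting piece is roughly 64
// bits wide, e.g. <4 x s32> becomes <2 x s32> and <6 x s16> becomes <3 x s16>.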
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size becomes the next
// multiple of 32 bits.
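// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits).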
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
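  // Otherwise use a vector of 32-bit elements, e.g.:
  // <4 x s16> -> <2 x s32>
  // <8 x s16> -> <4 x s32>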
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
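// Bitcast to a type built from 32-bit elements; the total size is asserted to
// be a multiple of 32. For example, s64 and <4 x s16> both become <2 x s32>.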
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
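// For example, <8 x s32>, <4 x s64> and <6 x s16> qualify, while <3 x s16>
// and <5 x s8> do not.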
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
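// For example, a G_SEXTLOAD from 32-bit memory into an s64 result is narrowed
// so the result type is s32 first (see the narrowScalarIf uses below).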
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is a case where the memory access
/// itself changes size, not the size of the result register.
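/// For example, a 96-bit load that is 128-bit aligned may be widened to a
/// 128-bit load on subtargets without 96-bit (dwordx3) memory operations,
/// provided the target allows the resulting access.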
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .clampScalar(0, S16, S32)
536       .clampMaxNumElements(0, S16, 2)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElements(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .clampScalar(0, S16, S32)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554     // Technically the saturating operations require clamp bit support, but this
555     // was introduced at the same time as 16-bit operations.
556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557       .legalFor({S32, S16}) // Clamp modifier
558       .minScalar(0, S16)
559       .scalarize(0)
560       .widenScalarToNextPow2(0, 16)
561       .lower();
562 
563     // We're just lowering this, but it helps get a better result to try to
564     // coerce to the desired type first.
565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566       .minScalar(0, S16)
567       .scalarize(0)
568       .lower();
569   } else {
570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571       .legalFor({S32})
572       .clampScalar(0, S32, S32)
573       .scalarize(0);
574 
575     if (ST.hasIntClamp()) {
576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577         .legalFor({S32}) // Clamp modifier.
578         .scalarize(0)
579         .minScalarOrElt(0, S32)
580         .lower();
581     } else {
582       // Clamp bit support was added in VI, along with 16-bit operations.
583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584         .minScalar(0, S32)
585         .scalarize(0)
586         .lower();
587     }
588 
589     // FIXME: DAG expansion gets better results. The widening uses the smaller
590     // range values and goes for the min/max lowering directly.
591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592       .minScalar(0, S32)
593       .scalarize(0)
594       .lower();
595   }
596 
597   getActionDefinitionsBuilder(
598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599       .customFor({S32, S64})
600       .clampScalar(0, S32, S64)
601       .widenScalarToNextPow2(0, 32)
602       .scalarize(0);
603 
604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605                    .legalFor({S32})
606                    .maxScalarOrElt(0, S32);
607 
608   if (ST.hasVOP3PInsts()) {
609     Mulh
610       .clampMaxNumElements(0, S8, 2)
611       .lowerFor({V2S8});
612   }
613 
614   Mulh
615     .scalarize(0)
616     .lower();
617 
618   // Report legal for any types we can handle anywhere. For the cases only legal
619   // on the SALU, RegBankSelect will be able to re-legalize.
620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622     .clampScalar(0, S32, S64)
623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625     .widenScalarToNextPow2(0)
626     .scalarize(0);
627 
628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630     .legalFor({{S32, S1}, {S32, S32}})
631     .minScalar(0, S32)
632     // TODO: .scalarize(0)
633     .lower();
634 
635   getActionDefinitionsBuilder(G_BITCAST)
636     // Don't worry about the size constraint.
637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
638     .lower();
639 
640 
641   getActionDefinitionsBuilder(G_CONSTANT)
642     .legalFor({S1, S32, S64, S16, GlobalPtr,
643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644     .legalIf(isPointer(0))
645     .clampScalar(0, S32, S64)
646     .widenScalarToNextPow2(0);
647 
648   getActionDefinitionsBuilder(G_FCONSTANT)
649     .legalFor({S32, S64, S16})
650     .clampScalar(0, S16, S64);
651 
652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653       .legalIf(isRegisterType(0))
654       // s1 and s16 are special cases because they have legal operations on
655       // them, but don't really occupy registers in the normal way.
656       .legalFor({S1, S16})
657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658       .clampScalarOrElt(0, S32, MaxScalar)
659       .widenScalarToNextPow2(0, 32)
660       .clampMaxNumElements(0, S32, 16);
661 
662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664   // If the amount is divergent, we have to do a wave reduction to get the
665   // maximum value, so this is expanded during RegBankSelect.
666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667     .legalFor({{PrivatePtr, S32}});
668 
669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670     .customIf(typeIsNot(0, PrivatePtr));
671 
672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674   auto &FPOpActions = getActionDefinitionsBuilder(
675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676     .legalFor({S32, S64});
677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678     .customFor({S32, S64});
679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680     .customFor({S32, S64});
681 
682   if (ST.has16BitInsts()) {
683     if (ST.hasVOP3PInsts())
684       FPOpActions.legalFor({S16, V2S16});
685     else
686       FPOpActions.legalFor({S16});
687 
688     TrigActions.customFor({S16});
689     FDIVActions.customFor({S16});
690   }
691 
692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695   if (ST.hasVOP3PInsts()) {
696     MinNumMaxNum.customFor(FPTypesPK16)
697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698       .clampMaxNumElements(0, S16, 2)
699       .clampScalar(0, S16, S64)
700       .scalarize(0);
701   } else if (ST.has16BitInsts()) {
702     MinNumMaxNum.customFor(FPTypes16)
703       .clampScalar(0, S16, S64)
704       .scalarize(0);
705   } else {
706     MinNumMaxNum.customFor(FPTypesBase)
707       .clampScalar(0, S32, S64)
708       .scalarize(0);
709   }
710 
711   if (ST.hasVOP3PInsts())
712     FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714   FPOpActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   TrigActions
719     .scalarize(0)
720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722   FDIVActions
723     .scalarize(0)
724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
727     .legalFor(FPTypesPK16)
728     .clampMaxNumElements(0, S16, 2)
729     .scalarize(0)
730     .clampScalar(0, S16, S64);
731 
732   if (ST.has16BitInsts()) {
733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734       .legalFor({S32, S64, S16})
735       .scalarize(0)
736       .clampScalar(0, S16, S64);
737   } else {
738     getActionDefinitionsBuilder(G_FSQRT)
739       .legalFor({S32, S64})
740       .scalarize(0)
741       .clampScalar(0, S32, S64);
742 
743     if (ST.hasFractBug()) {
744       getActionDefinitionsBuilder(G_FFLOOR)
745         .customFor({S64})
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     } else {
750       getActionDefinitionsBuilder(G_FFLOOR)
751         .legalFor({S32, S64})
752         .scalarize(0)
753         .clampScalar(0, S32, S64);
754     }
755   }
756 
757   getActionDefinitionsBuilder(G_FPTRUNC)
758     .legalFor({{S32, S64}, {S16, S32}})
759     .scalarize(0)
760     .lower();
761 
762   getActionDefinitionsBuilder(G_FPEXT)
763     .legalFor({{S64, S32}, {S32, S16}})
764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765     .scalarize(0);
766 
767   getActionDefinitionsBuilder(G_FSUB)
768       // Use actual fsub instruction
769       .legalFor({S32})
770       // Must use fadd + fneg
771       .lowerFor({S64, S16, V2S16})
772       .scalarize(0)
773       .clampScalar(0, S32, S64);
774 
775   // Whether this is legal depends on the floating point mode for the function.
776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778     FMad.customFor({S32, S16});
779   else if (ST.hasMadMacF32Insts())
780     FMad.customFor({S32});
781   else if (ST.hasMadF16())
782     FMad.customFor({S16});
783   FMad.scalarize(0)
784       .lower();
785 
786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787   if (ST.has16BitInsts()) {
788     FRem.customFor({S16, S32, S64});
789   } else {
790     FRem.minScalar(0, S32)
791         .customFor({S32, S64});
792   }
793   FRem.scalarize(0);
794 
795   // TODO: Do we need to clamp maximum bitwidth?
796   getActionDefinitionsBuilder(G_TRUNC)
797     .legalIf(isScalar(0))
798     .legalFor({{V2S16, V2S32}})
799     .clampMaxNumElements(0, S16, 2)
800     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801     // situations (like an invalid implicit use), we don't want to infinite loop
802     // in the legalizer.
803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804     .alwaysLegal();
805 
806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808                {S32, S1}, {S64, S1}, {S16, S1}})
809     .scalarize(0)
810     .clampScalar(0, S32, S64)
811     .widenScalarToNextPow2(1, 32);
812 
813   // TODO: Split s1->s64 during regbankselect for VALU.
814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816     .lowerFor({{S32, S64}})
817     .lowerIf(typeIs(1, S1))
818     .customFor({{S64, S64}});
819   if (ST.has16BitInsts())
820     IToFP.legalFor({{S16, S16}});
821   IToFP.clampScalar(1, S32, S64)
822        .minScalar(0, S32)
823        .scalarize(0)
824        .widenScalarToNextPow2(1);
825 
826   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
827     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
828     .customFor({{S64, S32}, {S64, S64}})
829     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
830   if (ST.has16BitInsts())
831     FPToI.legalFor({{S16, S16}});
832   else
833     FPToI.minScalar(1, S32);
834 
835   FPToI.minScalar(0, S32)
836        .widenScalarToNextPow2(0, 32)
837        .scalarize(0)
838        .lower();
839 
840   // Lower roundeven into G_FRINT
841   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
842     .scalarize(0)
843     .lower();
844 
845   if (ST.has16BitInsts()) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S16, S32, S64})
848       .clampScalar(0, S16, S64)
849       .scalarize(0);
850   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32, S64})
853       .clampScalar(0, S32, S64)
854       .scalarize(0);
855   } else {
856     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
857       .legalFor({S32})
858       .customFor({S64})
859       .clampScalar(0, S32, S64)
860       .scalarize(0);
861   }
862 
863   getActionDefinitionsBuilder(G_PTR_ADD)
864     .legalIf(all(isPointer(0), sameSize(0, 1)))
865     .scalarize(0)
866     .scalarSameSizeAs(1, 0);
867 
868   getActionDefinitionsBuilder(G_PTRMASK)
869     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
870     .scalarSameSizeAs(1, 0)
871     .scalarize(0);
872 
873   auto &CmpBuilder =
874     getActionDefinitionsBuilder(G_ICMP)
875     // The compare output type differs based on the register bank of the output,
876     // so make both s1 and s32 legal.
877     //
878     // Scalar compares producing output in scc will be promoted to s32, as that
879     // is the allocatable register type that will be needed for the copy from
880     // scc. This will be promoted during RegBankSelect, and we assume something
881     // before that won't try to use s32 result types.
882     //
883     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
884     // bank.
885     .legalForCartesianProduct(
886       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
887     .legalForCartesianProduct(
888       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
889   if (ST.has16BitInsts()) {
890     CmpBuilder.legalFor({{S1, S16}});
891   }
892 
893   CmpBuilder
894     .widenScalarToNextPow2(1)
895     .clampScalar(1, S32, S64)
896     .scalarize(0)
897     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
898 
899   getActionDefinitionsBuilder(G_FCMP)
900     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
901     .widenScalarToNextPow2(1)
902     .clampScalar(1, S32, S64)
903     .scalarize(0);
904 
905   // FIXME: fpow has a selection pattern that should move to custom lowering.
906   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
907   if (ST.has16BitInsts())
908     Exp2Ops.legalFor({S32, S16});
909   else
910     Exp2Ops.legalFor({S32});
911   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
912   Exp2Ops.scalarize(0);
913 
914   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
915   if (ST.has16BitInsts())
916     ExpOps.customFor({{S32}, {S16}});
917   else
918     ExpOps.customFor({S32});
919   ExpOps.clampScalar(0, MinScalarFPTy, S32)
920         .scalarize(0);
921 
922   getActionDefinitionsBuilder(G_FPOWI)
923     .clampScalar(0, MinScalarFPTy, S32)
924     .lower();
925 
926   // The 64-bit versions produce 32-bit results, but only on the SALU.
927   getActionDefinitionsBuilder(G_CTPOP)
928     .legalFor({{S32, S32}, {S32, S64}})
929     .clampScalar(0, S32, S32)
930     .clampScalar(1, S32, S64)
931     .scalarize(0)
932     .widenScalarToNextPow2(0, 32)
933     .widenScalarToNextPow2(1, 32);
934 
935   // The hardware instructions return a different result on 0 than the generic
936   // instructions expect. The hardware produces -1, but these produce the
937   // bitwidth.
938   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
939     .scalarize(0)
940     .clampScalar(0, S32, S32)
941     .clampScalar(1, S32, S64)
942     .widenScalarToNextPow2(0, 32)
943     .widenScalarToNextPow2(1, 32)
944     .lower();
945 
946   // The 64-bit versions produce 32-bit results, but only on the SALU.
947   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
948     .legalFor({{S32, S32}, {S32, S64}})
949     .clampScalar(0, S32, S32)
950     .clampScalar(1, S32, S64)
951     .scalarize(0)
952     .widenScalarToNextPow2(0, 32)
953     .widenScalarToNextPow2(1, 32);
954 
955   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
956   // RegBankSelect.
957   getActionDefinitionsBuilder(G_BITREVERSE)
958     .legalFor({S32, S64})
959     .clampScalar(0, S32, S64)
960     .scalarize(0)
961     .widenScalarToNextPow2(0);
962 
963   if (ST.has16BitInsts()) {
964     getActionDefinitionsBuilder(G_BSWAP)
965       .legalFor({S16, S32, V2S16})
966       .clampMaxNumElements(0, S16, 2)
967       // FIXME: Fixing non-power-of-2 before clamp is workaround for
968       // narrowScalar limitation.
969       .widenScalarToNextPow2(0)
970       .clampScalar(0, S16, S32)
971       .scalarize(0);
972 
973     if (ST.hasVOP3PInsts()) {
974       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
975         .legalFor({S32, S16, V2S16})
976         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
977         .clampMaxNumElements(0, S16, 2)
978         .minScalar(0, S16)
979         .widenScalarToNextPow2(0)
980         .scalarize(0)
981         .lower();
982     } else {
983       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
984         .legalFor({S32, S16})
985         .widenScalarToNextPow2(0)
986         .minScalar(0, S16)
987         .scalarize(0)
988         .lower();
989     }
990   } else {
991     // TODO: Should have same legality without v_perm_b32
992     getActionDefinitionsBuilder(G_BSWAP)
993       .legalFor({S32})
994       .lowerIf(scalarNarrowerThan(0, 32))
995       // FIXME: Fixing non-power-of-2 before clamp is workaround for
996       // narrowScalar limitation.
997       .widenScalarToNextPow2(0)
998       .maxScalar(0, S32)
999       .scalarize(0)
1000       .lower();
1001 
1002     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1003       .legalFor({S32})
1004       .minScalar(0, S32)
1005       .widenScalarToNextPow2(0)
1006       .scalarize(0)
1007       .lower();
1008   }
1009 
1010   getActionDefinitionsBuilder(G_INTTOPTR)
1011     // List the common cases
1012     .legalForCartesianProduct(AddrSpaces64, {S64})
1013     .legalForCartesianProduct(AddrSpaces32, {S32})
1014     .scalarize(0)
1015     // Accept any address space as long as the size matches
1016     .legalIf(sameSize(0, 1))
1017     .widenScalarIf(smallerThan(1, 0),
1018       [](const LegalityQuery &Query) {
1019         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1020       })
1021     .narrowScalarIf(largerThan(1, 0),
1022       [](const LegalityQuery &Query) {
1023         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1024       });
1025 
1026   getActionDefinitionsBuilder(G_PTRTOINT)
1027     // List the common cases
1028     .legalForCartesianProduct(AddrSpaces64, {S64})
1029     .legalForCartesianProduct(AddrSpaces32, {S32})
1030     .scalarize(0)
1031     // Accept any address space as long as the size matches
1032     .legalIf(sameSize(0, 1))
1033     .widenScalarIf(smallerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       })
1037     .narrowScalarIf(
1038       largerThan(0, 1),
1039       [](const LegalityQuery &Query) {
1040         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1041       });
1042 
1043   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1044     .scalarize(0)
1045     .custom();
1046 
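  // Decide whether a load or store must be split: because a vector result is
  // wider than the memory access, the access exceeds the maximum size for its
  // address space, it would need an unsupported dwordx3 access or a
  // non-power-of-2 number of registers, or the alignment is too low for the
  // target.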
1047   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1048                                     bool IsLoad) -> bool {
1049     const LLT DstTy = Query.Types[0];
1050 
1051     // Split vector extloads.
1052     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1053     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1054 
1055     if (MemSize < DstTy.getSizeInBits())
1056       MemSize = std::max(MemSize, AlignBits);
1057 
1058     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1059       return true;
1060 
1061     const LLT PtrTy = Query.Types[1];
1062     unsigned AS = PtrTy.getAddressSpace();
1063     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1064       return true;
1065 
1066     // Catch weird sized loads that don't evenly divide into the access sizes
1067     // TODO: May be able to widen depending on alignment etc.
1068     unsigned NumRegs = (MemSize + 31) / 32;
1069     if (NumRegs == 3) {
1070       if (!ST.hasDwordx3LoadStores())
1071         return true;
1072     } else {
1073       // If the alignment allows, these should have been widened.
1074       if (!isPowerOf2_32(NumRegs))
1075         return true;
1076     }
1077 
1078     if (AlignBits < MemSize) {
1079       const SITargetLowering *TLI = ST.getTargetLowering();
1080       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1081                                                       Align(AlignBits / 8));
1082     }
1083 
1084     return false;
1085   };
1086 
1087   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1088   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1089   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1090 
1091   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1092   // LDS
1093   // TODO: Unsupported flat for SI.
1094 
1095   for (unsigned Op : {G_LOAD, G_STORE}) {
1096     const bool IsStore = Op == G_STORE;
1097 
1098     auto &Actions = getActionDefinitionsBuilder(Op);
1099     // Explicitly list some common cases.
1100     // TODO: Does this help compile time at all?
1101     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1102                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1103                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1104                                       {S64, GlobalPtr, S64, GlobalAlign32},
1105                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1106                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1107                                       {S32, GlobalPtr, S8, GlobalAlign8},
1108                                       {S32, GlobalPtr, S16, GlobalAlign16},
1109 
1110                                       {S32, LocalPtr, S32, 32},
1111                                       {S64, LocalPtr, S64, 32},
1112                                       {V2S32, LocalPtr, V2S32, 32},
1113                                       {S32, LocalPtr, S8, 8},
1114                                       {S32, LocalPtr, S16, 16},
1115                                       {V2S16, LocalPtr, S32, 32},
1116 
1117                                       {S32, PrivatePtr, S32, 32},
1118                                       {S32, PrivatePtr, S8, 8},
1119                                       {S32, PrivatePtr, S16, 16},
1120                                       {V2S16, PrivatePtr, S32, 32},
1121 
1122                                       {S32, ConstantPtr, S32, GlobalAlign32},
1123                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1124                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1125                                       {S64, ConstantPtr, S64, GlobalAlign32},
1126                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1127     Actions.legalIf(
1128       [=](const LegalityQuery &Query) -> bool {
1129         return isLoadStoreLegal(ST, Query);
1130       });
1131 
1132     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1133     // 64-bits.
1134     //
1135     // TODO: Should generalize bitcast action into coerce, which will also cover
1136     // inserting addrspacecasts.
1137     Actions.customIf(typeIs(1, Constant32Ptr));
1138 
1139     // Turn any illegal element vectors into something easier to deal
1140     // with. These will ultimately produce 32-bit scalar shifts to extract the
1141     // parts anyway.
1142     //
1143     // For odd 16-bit element vectors, prefer to split those into pieces with
1144     // 16-bit vector parts.
1145     Actions.bitcastIf(
1146       [=](const LegalityQuery &Query) -> bool {
1147         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1148                                           Query.MMODescrs[0].MemoryTy);
1149       }, bitcastToRegisterType(0));
1150 
1151     if (!IsStore) {
1152       // Widen suitably aligned loads by loading extra bytes. The standard
1153       // legalization actions can't properly express widening memory operands.
1154       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1155         return shouldWidenLoad(ST, Query, G_LOAD);
1156       });
1157     }
1158 
1159     // FIXME: load/store narrowing should be moved to lower action
1160     Actions
1161         .narrowScalarIf(
1162             [=](const LegalityQuery &Query) -> bool {
1163               return !Query.Types[0].isVector() &&
1164                      needToSplitMemOp(Query, Op == G_LOAD);
1165             },
1166             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1167               const LLT DstTy = Query.Types[0];
1168               const LLT PtrTy = Query.Types[1];
1169 
1170               const unsigned DstSize = DstTy.getSizeInBits();
1171               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1172 
1173               // Split extloads.
1174               if (DstSize > MemSize)
1175                 return std::make_pair(0, LLT::scalar(MemSize));
1176 
1177               if (!isPowerOf2_32(DstSize)) {
1178                 // We're probably decomposing an odd sized store. Try to split
1179                 // to the widest type. TODO: Account for alignment. As-is it
1180                 // should be OK, since the new parts will be further legalized.
1181                 unsigned FloorSize = PowerOf2Floor(DstSize);
1182                 return std::make_pair(0, LLT::scalar(FloorSize));
1183               }
1184 
1185               if (DstSize > 32 && (DstSize % 32 != 0)) {
1186                 // FIXME: Need a way to specify non-extload of larger size if
1187                 // suitably aligned.
1188                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1189               }
1190 
1191               unsigned MaxSize = maxSizeForAddrSpace(ST,
1192                                                      PtrTy.getAddressSpace(),
1193                                                      Op == G_LOAD);
1194               if (MemSize > MaxSize)
1195                 return std::make_pair(0, LLT::scalar(MaxSize));
1196 
1197               unsigned Align = Query.MMODescrs[0].AlignInBits;
1198               return std::make_pair(0, LLT::scalar(Align));
1199             })
1200         .fewerElementsIf(
1201             [=](const LegalityQuery &Query) -> bool {
1202               return Query.Types[0].isVector() &&
1203                      needToSplitMemOp(Query, Op == G_LOAD);
1204             },
1205             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1206               const LLT DstTy = Query.Types[0];
1207               const LLT PtrTy = Query.Types[1];
1208 
1209               LLT EltTy = DstTy.getElementType();
1210               unsigned MaxSize = maxSizeForAddrSpace(ST,
1211                                                      PtrTy.getAddressSpace(),
1212                                                      Op == G_LOAD);
1213 
1214               // FIXME: Handle widened to power of 2 results better. This ends
1215               // up scalarizing.
1216               // FIXME: 3 element stores scalarized on SI
1217 
1218               // Split if it's too large for the address space.
1219               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1220               if (MemSize > MaxSize) {
1221                 unsigned NumElts = DstTy.getNumElements();
1222                 unsigned EltSize = EltTy.getSizeInBits();
1223 
1224                 if (MaxSize % EltSize == 0) {
1225                   return std::make_pair(
1226                       0, LLT::scalarOrVector(
1227                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1228                 }
1229 
1230                 unsigned NumPieces = MemSize / MaxSize;
1231 
1232                 // FIXME: Refine when odd breakdowns handled
1233                 // The scalars will need to be re-legalized.
1234                 if (NumPieces == 1 || NumPieces >= NumElts ||
1235                     NumElts % NumPieces != 0)
1236                   return std::make_pair(0, EltTy);
1237 
1238                 return std::make_pair(
1239                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1240               }
1241 
1242               // FIXME: We could probably handle weird extending loads better.
1243               if (DstTy.getSizeInBits() > MemSize)
1244                 return std::make_pair(0, EltTy);
1245 
1246               unsigned EltSize = EltTy.getSizeInBits();
1247               unsigned DstSize = DstTy.getSizeInBits();
1248               if (!isPowerOf2_32(DstSize)) {
1249                 // We're probably decomposing an odd sized store. Try to split
1250                 // to the widest type. TODO: Account for alignment. As-is it
1251                 // should be OK, since the new parts will be further legalized.
1252                 unsigned FloorSize = PowerOf2Floor(DstSize);
1253                 return std::make_pair(
1254                     0, LLT::scalarOrVector(
1255                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1256               }
1257 
1258               // Need to split because of alignment.
1259               unsigned Align = Query.MMODescrs[0].AlignInBits;
1260               if (EltSize > Align &&
1261                   (EltSize / Align < DstTy.getNumElements())) {
1262                 return std::make_pair(
1263                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1264               }
1265 
1266               // May need relegalization for the scalars.
1267               return std::make_pair(0, EltTy);
1268             })
1269     .lowerIfMemSizeNotPow2()
1270     .minScalar(0, S32)
1271     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1272     .widenScalarToNextPow2(0)
1273     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1274     .lower();
1275   }
1276 
1277   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1278                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1279                                                   {S32, GlobalPtr, S16, 2 * 8},
1280                                                   {S32, LocalPtr, S8, 8},
1281                                                   {S32, LocalPtr, S16, 16},
1282                                                   {S32, PrivatePtr, S8, 8},
1283                                                   {S32, PrivatePtr, S16, 16},
1284                                                   {S32, ConstantPtr, S8, 8},
1285                                                   {S32, ConstantPtr, S16, 2 * 8}})
1286                        .legalIf(
1287                          [=](const LegalityQuery &Query) -> bool {
1288                            return isLoadStoreLegal(ST, Query);
1289                          });
1290 
1291   if (ST.hasFlatAddressSpace()) {
1292     ExtLoads.legalForTypesWithMemDesc(
1293         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1294   }
1295 
1296   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1297   // 64-bits.
1298   //
1299   // TODO: Should generalize bitcast action into coerce, which will also cover
1300   // inserting addrspacecasts.
1301   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1302 
1303   ExtLoads.clampScalar(0, S32, S32)
1304           .widenScalarToNextPow2(0)
1305           .unsupportedIfMemSizeNotPow2()
1306           .lower();
1307 
1308   auto &Atomics = getActionDefinitionsBuilder(
1309     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1310      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1311      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1312      G_ATOMICRMW_UMIN})
1313     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1314                {S64, GlobalPtr}, {S64, LocalPtr},
1315                {S32, RegionPtr}, {S64, RegionPtr}});
1316   if (ST.hasFlatAddressSpace()) {
1317     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1318   }
1319 
1320   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1321   if (ST.hasLDSFPAtomics()) {
1322     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1323     if (ST.hasGFX90AInsts())
1324       Atomic.legalFor({{S64, LocalPtr}});
1325   }
1326   if (ST.hasAtomicFaddInsts())
1327     Atomic.legalFor({{S32, GlobalPtr}});
1328 
1329   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1330   // demarshalling
1331   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1332     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1333                 {S32, FlatPtr}, {S64, FlatPtr}})
1334     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1335                {S32, RegionPtr}, {S64, RegionPtr}});
1336   // TODO: Pointer types, any 32-bit or 64-bit vector
1337 
1338   // Condition should be s32 for scalar, s1 for vector.
1339   getActionDefinitionsBuilder(G_SELECT)
1340       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1341                                  LocalPtr, FlatPtr, PrivatePtr,
1342                                  LLT::fixed_vector(2, LocalPtr),
1343                                  LLT::fixed_vector(2, PrivatePtr)},
1344                                 {S1, S32})
1345       .clampScalar(0, S16, S64)
1346       .scalarize(1)
1347       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1348       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1349       .clampMaxNumElements(0, S32, 2)
1350       .clampMaxNumElements(0, LocalPtr, 2)
1351       .clampMaxNumElements(0, PrivatePtr, 2)
1352       .scalarize(0)
1353       .widenScalarToNextPow2(0)
1354       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1355 
1356   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1357   // be more flexible with the shift amount type.
1358   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1359     .legalFor({{S32, S32}, {S64, S32}});
1360   if (ST.has16BitInsts()) {
1361     if (ST.hasVOP3PInsts()) {
1362       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1363             .clampMaxNumElements(0, S16, 2);
1364     } else
1365       Shifts.legalFor({{S16, S16}});
1366 
1367     // TODO: Support 16-bit shift amounts for all types
1368     Shifts.widenScalarIf(
1369       [=](const LegalityQuery &Query) {
1370         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1371         // 32-bit amount.
1372         const LLT ValTy = Query.Types[0];
1373         const LLT AmountTy = Query.Types[1];
1374         return ValTy.getSizeInBits() <= 16 &&
1375                AmountTy.getSizeInBits() < 16;
1376       }, changeTo(1, S16));
1377     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1378     Shifts.clampScalar(1, S32, S32);
1379     Shifts.clampScalar(0, S16, S64);
1380     Shifts.widenScalarToNextPow2(0, 16);
1381 
1382     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1383       .minScalar(0, S16)
1384       .scalarize(0)
1385       .lower();
1386   } else {
1387     // Make sure we legalize the shift amount type first, as the general
1388     // expansion for the shifted type will produce much worse code if it hasn't
1389     // been truncated already.
1390     Shifts.clampScalar(1, S32, S32);
1391     Shifts.clampScalar(0, S32, S64);
1392     Shifts.widenScalarToNextPow2(0, 32);
1393 
1394     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1395       .minScalar(0, S32)
1396       .scalarize(0)
1397       .lower();
1398   }
1399   Shifts.scalarize(0);
1400 
1401   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1402     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1403     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1404     unsigned IdxTypeIdx = 2;
1405 
1406     getActionDefinitionsBuilder(Op)
1407       .customIf([=](const LegalityQuery &Query) {
1408           const LLT EltTy = Query.Types[EltTypeIdx];
1409           const LLT VecTy = Query.Types[VecTypeIdx];
1410           const LLT IdxTy = Query.Types[IdxTypeIdx];
1411           const unsigned EltSize = EltTy.getSizeInBits();
1412           return (EltSize == 32 || EltSize == 64) &&
1413                   VecTy.getSizeInBits() % 32 == 0 &&
1414                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1415                   IdxTy.getSizeInBits() == 32;
1416         })
1417       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1418                  bitcastToVectorElement32(VecTypeIdx))
1419       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1420       .bitcastIf(
1421         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1422         [=](const LegalityQuery &Query) {
1423           // For > 64-bit element types, try to turn this into a 64-bit
1424           // element vector since we may be able to do better indexing
1425           // if this is scalar. If not, fall back to 32.
1426           const LLT EltTy = Query.Types[EltTypeIdx];
1427           const LLT VecTy = Query.Types[VecTypeIdx];
1428           const unsigned DstEltSize = EltTy.getSizeInBits();
1429           const unsigned VecSize = VecTy.getSizeInBits();
1430 
1431           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1432           return std::make_pair(
1433               VecTypeIdx,
1434               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1435         })
1436       .clampScalar(EltTypeIdx, S32, S64)
1437       .clampScalar(VecTypeIdx, S32, S64)
1438       .clampScalar(IdxTypeIdx, S32, S32)
1439       .clampMaxNumElements(VecTypeIdx, S32, 32)
1440       // TODO: Clamp elements for 64-bit vectors?
1441       // It should only be necessary with variable indexes.
1442       // As a last resort, lower to the stack
1443       .lower();
1444   }
1445 
1446   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1447     .unsupportedIf([=](const LegalityQuery &Query) {
1448         const LLT &EltTy = Query.Types[1].getElementType();
1449         return Query.Types[0] != EltTy;
1450       });
1451 
1452   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1453     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1454     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1455 
1456     // FIXME: Doesn't handle extract of illegal sizes.
1457     getActionDefinitionsBuilder(Op)
1458       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1459       // FIXME: Multiples of 16 should not be legal.
1460       .legalIf([=](const LegalityQuery &Query) {
1461           const LLT BigTy = Query.Types[BigTyIdx];
1462           const LLT LitTy = Query.Types[LitTyIdx];
1463           return (BigTy.getSizeInBits() % 32 == 0) &&
1464                  (LitTy.getSizeInBits() % 16 == 0);
1465         })
1466       .widenScalarIf(
1467         [=](const LegalityQuery &Query) {
1468           const LLT BigTy = Query.Types[BigTyIdx];
1469           return (BigTy.getScalarSizeInBits() < 16);
1470         },
1471         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1472       .widenScalarIf(
1473         [=](const LegalityQuery &Query) {
1474           const LLT LitTy = Query.Types[LitTyIdx];
1475           return (LitTy.getScalarSizeInBits() < 16);
1476         },
1477         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1478       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1479       .widenScalarToNextPow2(BigTyIdx, 32);
1480 
1481   }
1482 
1483   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1484     .legalForCartesianProduct(AllS32Vectors, {S32})
1485     .legalForCartesianProduct(AllS64Vectors, {S64})
1486     .clampNumElements(0, V16S32, V32S32)
1487     .clampNumElements(0, V2S64, V16S64)
1488     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1489 
1490   if (ST.hasScalarPackInsts()) {
1491     BuildVector
1492       // FIXME: Should probably widen s1 vectors straight to s32
1493       .minScalarOrElt(0, S16)
1494       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1495       .minScalar(1, S32);
1496 
1497     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1498       .legalFor({V2S16, S32})
1499       .lower();
1500     BuildVector.minScalarOrElt(0, S32);
1501   } else {
1502     BuildVector.customFor({V2S16, S16});
1503     BuildVector.minScalarOrElt(0, S32);
1504 
1505     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1506       .customFor({V2S16, S32})
1507       .lower();
1508   }
1509 
1510   BuildVector.legalIf(isRegisterType(0));
1511 
1512   // FIXME: Clamp maximum size
1513   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1514     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1515     .clampMaxNumElements(0, S32, 32)
1516     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1517     .clampMaxNumElements(0, S16, 64);
1518 
1519   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1520   // pre-legalize.
1521   if (ST.hasVOP3PInsts()) {
1522     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1523       .customFor({V2S16, V2S16})
1524       .lower();
1525   } else
1526     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1527 
1528   // Merge/Unmerge
1529   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1530     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1531     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1532 
1533     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1534       const LLT Ty = Query.Types[TypeIdx];
1535       if (Ty.isVector()) {
1536         const LLT &EltTy = Ty.getElementType();
1537         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1538           return true;
1539         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1540           return true;
1541       }
1542       return false;
1543     };
1544 
1545     auto &Builder = getActionDefinitionsBuilder(Op)
1546       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1547       .lowerFor({{S16, V2S16}})
1548       .lowerIf([=](const LegalityQuery &Query) {
1549           const LLT BigTy = Query.Types[BigTyIdx];
1550           return BigTy.getSizeInBits() == 32;
1551         })
1552       // Try to widen to s16 first for small types.
1553       // TODO: Only do this on targets with legal s16 shifts
1554       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1555       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1556       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1557       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1558                            elementTypeIs(1, S16)),
1559                        changeTo(1, V2S16))
1560       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1561       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1562       // valid.
1563       .clampScalar(LitTyIdx, S32, S512)
1564       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1565       // Break up vectors with weird elements into scalars
1566       .fewerElementsIf(
1567         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1568         scalarize(0))
1569       .fewerElementsIf(
1570         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1571         scalarize(1))
1572       .clampScalar(BigTyIdx, S32, MaxScalar);
1573 
1574     if (Op == G_MERGE_VALUES) {
1575       Builder.widenScalarIf(
1576         // TODO: Use 16-bit shifts if legal for 8-bit values?
1577         [=](const LegalityQuery &Query) {
1578           const LLT Ty = Query.Types[LitTyIdx];
1579           return Ty.getSizeInBits() < 32;
1580         },
1581         changeTo(LitTyIdx, S32));
1582     }
1583 
1584     Builder.widenScalarIf(
1585       [=](const LegalityQuery &Query) {
1586         const LLT Ty = Query.Types[BigTyIdx];
1587         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1588           Ty.getSizeInBits() % 16 != 0;
1589       },
1590       [=](const LegalityQuery &Query) {
1591         // Pick the next power of 2, or a multiple of 64 over 128,
1592         // whichever is smaller.
1593         const LLT &Ty = Query.Types[BigTyIdx];
1594         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1595         if (NewSizeInBits >= 256) {
1596           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1597           if (RoundedTo < NewSizeInBits)
1598             NewSizeInBits = RoundedTo;
1599         }
1600         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1601       })
1602       // Any vectors left are the wrong size. Scalarize them.
1603       .scalarize(0)
1604       .scalarize(1);
1605   }
1606 
1607   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1608   // RegBankSelect.
1609   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1610     .legalFor({{S32}, {S64}});
1611 
1612   if (ST.hasVOP3PInsts()) {
1613     SextInReg.lowerFor({{V2S16}})
1614       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1615       // get more vector shift opportunities, since we'll get those when
1616       // expanded.
1617       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1618   } else if (ST.has16BitInsts()) {
1619     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1620   } else {
1621     // Prefer to promote to s32 before lowering if we don't have 16-bit
1622     // shifts. This avoids a lot of intermediate truncate and extend operations.
1623     SextInReg.lowerFor({{S32}, {S64}});
1624   }
1625 
1626   SextInReg
1627     .scalarize(0)
1628     .clampScalar(0, S32, S64)
1629     .lower();
1630 
1631   // TODO: Only try to form v2s16 with legal packed instructions.
1632   getActionDefinitionsBuilder(G_FSHR)
1633     .legalFor({{S32, S32}})
1634     .lowerFor({{V2S16, V2S16}})
1635     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1636     .scalarize(0)
1637     .lower();
1638 
1639   if (ST.hasVOP3PInsts()) {
1640     getActionDefinitionsBuilder(G_FSHL)
1641       .lowerFor({{V2S16, V2S16}})
1642       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1643       .scalarize(0)
1644       .lower();
1645   } else {
1646     getActionDefinitionsBuilder(G_FSHL)
1647       .scalarize(0)
1648       .lower();
1649   }
1650 
1651   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652     .legalFor({S64});
1653 
1654   getActionDefinitionsBuilder(G_FENCE)
1655     .alwaysLegal();
1656 
1657   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658       .scalarize(0)
1659       .minScalar(0, S32)
1660       .lower();
1661 
1662   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663       .legalFor({{S32, S32}, {S64, S32}})
1664       .clampScalar(1, S32, S32)
1665       .clampScalar(0, S32, S64)
1666       .widenScalarToNextPow2(0)
1667       .scalarize(0);
1668 
1669   getActionDefinitionsBuilder({
1670       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1671       G_FCOPYSIGN,
1672 
1673       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674       G_ATOMICRMW_NAND,
1675       G_ATOMICRMW_FSUB,
1676       G_READ_REGISTER,
1677       G_WRITE_REGISTER,
1678 
1679       G_SADDO, G_SSUBO,
1680 
1681        // TODO: Implement
1682       G_FMINIMUM, G_FMAXIMUM}).lower();
1683 
1684   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1685         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1686         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1687     .unsupported();
1688 
1689   getLegacyLegalizerInfo().computeTables();
1690   verify(*ST.getInstrInfo());
1691 }
1692 
1693 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1694                                          MachineInstr &MI) const {
1695   MachineIRBuilder &B = Helper.MIRBuilder;
1696   MachineRegisterInfo &MRI = *B.getMRI();
1697 
1698   switch (MI.getOpcode()) {
1699   case TargetOpcode::G_ADDRSPACE_CAST:
1700     return legalizeAddrSpaceCast(MI, MRI, B);
1701   case TargetOpcode::G_FRINT:
1702     return legalizeFrint(MI, MRI, B);
1703   case TargetOpcode::G_FCEIL:
1704     return legalizeFceil(MI, MRI, B);
1705   case TargetOpcode::G_FREM:
1706     return legalizeFrem(MI, MRI, B);
1707   case TargetOpcode::G_INTRINSIC_TRUNC:
1708     return legalizeIntrinsicTrunc(MI, MRI, B);
1709   case TargetOpcode::G_SITOFP:
1710     return legalizeITOFP(MI, MRI, B, true);
1711   case TargetOpcode::G_UITOFP:
1712     return legalizeITOFP(MI, MRI, B, false);
1713   case TargetOpcode::G_FPTOSI:
1714     return legalizeFPTOI(MI, MRI, B, true);
1715   case TargetOpcode::G_FPTOUI:
1716     return legalizeFPTOI(MI, MRI, B, false);
1717   case TargetOpcode::G_FMINNUM:
1718   case TargetOpcode::G_FMAXNUM:
1719   case TargetOpcode::G_FMINNUM_IEEE:
1720   case TargetOpcode::G_FMAXNUM_IEEE:
1721     return legalizeMinNumMaxNum(Helper, MI);
1722   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1723     return legalizeExtractVectorElt(MI, MRI, B);
1724   case TargetOpcode::G_INSERT_VECTOR_ELT:
1725     return legalizeInsertVectorElt(MI, MRI, B);
1726   case TargetOpcode::G_SHUFFLE_VECTOR:
1727     return legalizeShuffleVector(MI, MRI, B);
1728   case TargetOpcode::G_FSIN:
1729   case TargetOpcode::G_FCOS:
1730     return legalizeSinCos(MI, MRI, B);
1731   case TargetOpcode::G_GLOBAL_VALUE:
1732     return legalizeGlobalValue(MI, MRI, B);
1733   case TargetOpcode::G_LOAD:
1734   case TargetOpcode::G_SEXTLOAD:
1735   case TargetOpcode::G_ZEXTLOAD:
1736     return legalizeLoad(Helper, MI);
1737   case TargetOpcode::G_FMAD:
1738     return legalizeFMad(MI, MRI, B);
1739   case TargetOpcode::G_FDIV:
1740     return legalizeFDIV(MI, MRI, B);
1741   case TargetOpcode::G_UDIV:
1742   case TargetOpcode::G_UREM:
1743   case TargetOpcode::G_UDIVREM:
1744     return legalizeUnsignedDIV_REM(MI, MRI, B);
1745   case TargetOpcode::G_SDIV:
1746   case TargetOpcode::G_SREM:
1747   case TargetOpcode::G_SDIVREM:
1748     return legalizeSignedDIV_REM(MI, MRI, B);
1749   case TargetOpcode::G_ATOMIC_CMPXCHG:
1750     return legalizeAtomicCmpXChg(MI, MRI, B);
1751   case TargetOpcode::G_FLOG:
1752     return legalizeFlog(MI, B, numbers::ln2f);
1753   case TargetOpcode::G_FLOG10:
1754     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1755   case TargetOpcode::G_FEXP:
1756     return legalizeFExp(MI, B);
1757   case TargetOpcode::G_FPOW:
1758     return legalizeFPow(MI, B);
1759   case TargetOpcode::G_FFLOOR:
1760     return legalizeFFloor(MI, MRI, B);
1761   case TargetOpcode::G_BUILD_VECTOR:
1762     return legalizeBuildVector(MI, MRI, B);
1763   default:
1764     return false;
1765   }
1766 
1767   llvm_unreachable("expected switch to return");
1768 }
1769 
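// Return the high 32 bits of the aperture base address for the LOCAL or
// PRIVATE address space. On subtargets with aperture registers this is read
// with s_getreg; otherwise it is loaded from the queue pointer. Returns an
// invalid register on failure.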
1770 Register AMDGPULegalizerInfo::getSegmentAperture(
1771   unsigned AS,
1772   MachineRegisterInfo &MRI,
1773   MachineIRBuilder &B) const {
1774   MachineFunction &MF = B.getMF();
1775   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1776   const LLT S32 = LLT::scalar(32);
1777 
1778   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1779 
1780   if (ST.hasApertureRegs()) {
1781     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1782     // getreg.
1783     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1784         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1785         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1786     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1787         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1788         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1789     unsigned Encoding =
1790         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1791         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1792         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1793 
1794     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1795 
1796     B.buildInstr(AMDGPU::S_GETREG_B32)
1797       .addDef(GetReg)
1798       .addImm(Encoding);
1799     MRI.setType(GetReg, S32);
1800 
1801     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1802     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1803   }
1804 
1805   Register QueuePtr = MRI.createGenericVirtualRegister(
1806     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1807 
1808   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1809     return Register();
1810 
1811   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1812   // private_segment_aperture_base_hi.
1813   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1814 
1815   // TODO: can we be smarter about machine pointer info?
1816   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1817   MachineMemOperand *MMO = MF.getMachineMemOperand(
1818       PtrInfo,
1819       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1820           MachineMemOperand::MOInvariant,
1821       4, commonAlignment(Align(64), StructOffset));
1822 
1823   Register LoadAddr;
1824 
1825   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1826   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1827 }
1828 
1829 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1830   MachineInstr &MI, MachineRegisterInfo &MRI,
1831   MachineIRBuilder &B) const {
1832   MachineFunction &MF = B.getMF();
1833 
1834   const LLT S32 = LLT::scalar(32);
1835   Register Dst = MI.getOperand(0).getReg();
1836   Register Src = MI.getOperand(1).getReg();
1837 
1838   LLT DstTy = MRI.getType(Dst);
1839   LLT SrcTy = MRI.getType(Src);
1840   unsigned DestAS = DstTy.getAddressSpace();
1841   unsigned SrcAS = SrcTy.getAddressSpace();
1842 
1843   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1844   // vector element.
1845   assert(!DstTy.isVector());
1846 
1847   const AMDGPUTargetMachine &TM
1848     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1849 
1850   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1851     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1852     return true;
1853   }
1854 
1855   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1856     // Truncate.
1857     B.buildExtract(Dst, Src, 0);
1858     MI.eraseFromParent();
1859     return true;
1860   }
1861 
1862   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1864     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1865 
1866     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1867     // another. Merge operands are required to be the same type, but creating an
1868     // extra ptrtoint would be kind of pointless.
1869     auto HighAddr = B.buildConstant(
1870       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1871     B.buildMerge(Dst, {Src, HighAddr});
1872     MI.eraseFromParent();
1873     return true;
1874   }
1875 
1876   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1877     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1878            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1879     unsigned NullVal = TM.getNullPointerValue(DestAS);
1880 
1881     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1882     auto FlatNull = B.buildConstant(SrcTy, 0);
1883 
1884     // Extract low 32-bits of the pointer.
1885     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1886 
1887     auto CmpRes =
1888         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1889     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1890 
1891     MI.eraseFromParent();
1892     return true;
1893   }
1894 
1895   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1896     return false;
1897 
1898   if (!ST.hasFlatAddressSpace())
1899     return false;
1900 
1901   auto SegmentNull =
1902       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1903   auto FlatNull =
1904       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1905 
1906   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1907   if (!ApertureReg.isValid())
1908     return false;
1909 
1910   auto CmpRes =
1911       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1912 
1913   // Coerce the type of the low half of the result so we can use merge_values.
1914   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1915 
1916   // TODO: Should we allow mismatched types but matching sizes in merges to
1917   // avoid the ptrtoint?
1918   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1919   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1920 
1921   MI.eraseFromParent();
1922   return true;
1923 }
1924 
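// Expand f64 G_FRINT by adding and subtracting 2^52 with the sign of the
// source; the f64 addition rounds away the fractional bits. Sources whose
// magnitude is 2^52 or larger are already integral and are passed through
// unchanged.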
1925 bool AMDGPULegalizerInfo::legalizeFrint(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   MachineIRBuilder &B) const {
1928   Register Src = MI.getOperand(1).getReg();
1929   LLT Ty = MRI.getType(Src);
1930   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1931 
1932   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1933   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1934 
1935   auto C1 = B.buildFConstant(Ty, C1Val);
1936   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1937 
1938   // TODO: Should this propagate fast-math-flags?
1939   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1940   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1941 
1942   auto C2 = B.buildFConstant(Ty, C2Val);
1943   auto Fabs = B.buildFAbs(Ty, Src);
1944 
1945   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1946   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1947   MI.eraseFromParent();
1948   return true;
1949 }
1950 
1951 bool AMDGPULegalizerInfo::legalizeFceil(
1952   MachineInstr &MI, MachineRegisterInfo &MRI,
1953   MachineIRBuilder &B) const {
1954 
1955   const LLT S1 = LLT::scalar(1);
1956   const LLT S64 = LLT::scalar(64);
1957 
1958   Register Src = MI.getOperand(1).getReg();
1959   assert(MRI.getType(Src) == S64);
1960 
1961   // result = trunc(src)
1962   // if (src > 0.0 && src != result)
1963   //   result += 1.0
1964 
1965   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1966 
1967   const auto Zero = B.buildFConstant(S64, 0.0);
1968   const auto One = B.buildFConstant(S64, 1.0);
1969   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1970   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1971   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1972   auto Add = B.buildSelect(S64, And, One, Zero);
1973 
1974   // TODO: Should this propagate fast-math-flags?
1975   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1976   return true;
1977 }
1978 
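// Expand G_FREM as x - trunc(x / y) * y, with the final multiply and subtract
// folded into a single fma.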
1979 bool AMDGPULegalizerInfo::legalizeFrem(
1980   MachineInstr &MI, MachineRegisterInfo &MRI,
1981   MachineIRBuilder &B) const {
1982     Register DstReg = MI.getOperand(0).getReg();
1983     Register Src0Reg = MI.getOperand(1).getReg();
1984     Register Src1Reg = MI.getOperand(2).getReg();
1985     auto Flags = MI.getFlags();
1986     LLT Ty = MRI.getType(DstReg);
1987 
1988     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1989     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1990     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1991     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1992     MI.eraseFromParent();
1993     return true;
1994 }
1995 
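// Extract the 11-bit biased exponent from the high 32 bits of an f64 and
// subtract the exponent bias (1023).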
1996 static MachineInstrBuilder extractF64Exponent(Register Hi,
1997                                               MachineIRBuilder &B) {
1998   const unsigned FractBits = 52;
1999   const unsigned ExpBits = 11;
2000   LLT S32 = LLT::scalar(32);
2001 
2002   auto Const0 = B.buildConstant(S32, FractBits - 32);
2003   auto Const1 = B.buildConstant(S32, ExpBits);
2004 
2005   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2006     .addUse(Hi)
2007     .addUse(Const0.getReg(0))
2008     .addUse(Const1.getReg(0));
2009 
2010   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2011 }
2012 
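// Implement f64 G_INTRINSIC_TRUNC by clearing the fraction bits that lie
// below the binary point, as determined by the exponent. Exponents below 0
// produce a signed zero; exponents above 51 mean the value is already
// integral, so the source is returned unchanged.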
2013 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2014   MachineInstr &MI, MachineRegisterInfo &MRI,
2015   MachineIRBuilder &B) const {
2016   const LLT S1 = LLT::scalar(1);
2017   const LLT S32 = LLT::scalar(32);
2018   const LLT S64 = LLT::scalar(64);
2019 
2020   Register Src = MI.getOperand(1).getReg();
2021   assert(MRI.getType(Src) == S64);
2022 
2023   // TODO: Should this use extract since the low half is unused?
2024   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2025   Register Hi = Unmerge.getReg(1);
2026 
2027   // Extract the upper half, since this is where we will find the sign and
2028   // exponent.
2029   auto Exp = extractF64Exponent(Hi, B);
2030 
2031   const unsigned FractBits = 52;
2032 
2033   // Extract the sign bit.
2034   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2035   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2036 
2037   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2038 
2039   const auto Zero32 = B.buildConstant(S32, 0);
2040 
2041   // Extend back to 64-bits.
2042   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2043 
2044   auto Shr = B.buildAShr(S64, FractMask, Exp);
2045   auto Not = B.buildNot(S64, Shr);
2046   auto Tmp0 = B.buildAnd(S64, Src, Not);
2047   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2048 
2049   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2050   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2051 
2052   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2053   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2054   MI.eraseFromParent();
2055   return true;
2056 }
2057 
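// Convert an s64 integer to f64 by converting the two 32-bit halves
// separately: the high half (signed or unsigned) is scaled by 2^32 with
// ldexp and the unsigned low half is added in.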
2058 bool AMDGPULegalizerInfo::legalizeITOFP(
2059   MachineInstr &MI, MachineRegisterInfo &MRI,
2060   MachineIRBuilder &B, bool Signed) const {
2061 
2062   Register Dst = MI.getOperand(0).getReg();
2063   Register Src = MI.getOperand(1).getReg();
2064 
2065   const LLT S64 = LLT::scalar(64);
2066   const LLT S32 = LLT::scalar(32);
2067 
2068   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2069 
2070   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2071 
2072   auto CvtHi = Signed ?
2073     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2074     B.buildUITOFP(S64, Unmerge.getReg(1));
2075 
2076   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2077 
2078   auto ThirtyTwo = B.buildConstant(S32, 32);
2079   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2080     .addUse(CvtHi.getReg(0))
2081     .addUse(ThirtyTwo.getReg(0));
2082 
2083   // TODO: Should this propagate fast-math-flags?
2084   B.buildFAdd(Dst, LdExp, CvtLo);
2085   MI.eraseFromParent();
2086   return true;
2087 }
2088 
2089 // TODO: Copied from DAG implementation. Verify logic and document how this
2090 // actually works.
2091 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2092                                         MachineRegisterInfo &MRI,
2093                                         MachineIRBuilder &B,
2094                                         bool Signed) const {
2095 
2096   Register Dst = MI.getOperand(0).getReg();
2097   Register Src = MI.getOperand(1).getReg();
2098 
2099   const LLT S64 = LLT::scalar(64);
2100   const LLT S32 = LLT::scalar(32);
2101 
2102   const LLT SrcLT = MRI.getType(Src);
2103   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2104 
2105   unsigned Flags = MI.getFlags();
2106 
2107   // The basic idea of converting a floating point number into a pair of 32-bit
2108   // integers is illustrated as follows:
2109   //
2110   //     tf := trunc(val);
2111   //    hif := floor(tf * 2^-32);
2112   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2113   //     hi := fptoi(hif);
2114   //     lo := fptoi(lof);
2115   //
2116   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2117   MachineInstrBuilder Sign;
2118   if (Signed && SrcLT == S32) {
2119     // However, a 32-bit floating point number has only a 23-bit mantissa,
2120     // which is not enough to hold all the significant bits of `lof` if val is
2121     // negative. To avoid the loss of precision, we need to take the absolute
2122     // value after truncating and flip the result back based on the original
2123     // signedness.
2124     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2125     Trunc = B.buildFAbs(S32, Trunc, Flags);
2126   }
2127   MachineInstrBuilder K0, K1;
2128   if (SrcLT == S64) {
2129     K0 = B.buildFConstant(S64,
2130                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2131     K1 = B.buildFConstant(S64,
2132                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2133   } else {
2134     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2135     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2136   }
2137 
2138   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2139   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2140   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2141 
2142   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2143                                      : B.buildFPTOUI(S32, FloorMul);
2144   auto Lo = B.buildFPTOUI(S32, Fma);
2145 
2146   if (Signed && SrcLT == S32) {
2147     // Flip the result based on the signedness, which is either all 0s or 1s.
2148     Sign = B.buildMerge(S64, {Sign, Sign});
2149     // r := xor({lo, hi}, sign) - sign;
2150     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2151   } else
2152     B.buildMerge(Dst, {Lo, Hi});
2153   MI.eraseFromParent();
2154 
2155   return true;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2159                                                MachineInstr &MI) const {
2160   MachineFunction &MF = Helper.MIRBuilder.getMF();
2161   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2162 
2163   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2164                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2165 
2166   // With ieee_mode disabled, the instructions already have the correct
2167   // behavior for G_FMINNUM/G_FMAXNUM.
2168   if (!MFI->getMode().IEEE)
2169     return !IsIEEEOp;
2170 
2171   if (IsIEEEOp)
2172     return true;
2173 
2174   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2175 }
2176 
2177 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2178   MachineInstr &MI, MachineRegisterInfo &MRI,
2179   MachineIRBuilder &B) const {
2180   // TODO: Should move some of this into LegalizerHelper.
2181 
2182   // TODO: Promote dynamic indexing of s16 to s32
2183 
2184   // FIXME: Artifact combiner probably should have replaced the truncated
2185   // constant before this, so we shouldn't need
2186   // getConstantVRegValWithLookThrough.
2187   Optional<ValueAndVReg> MaybeIdxVal =
2188       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2189   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2190     return true;
2191   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2192 
2193   Register Dst = MI.getOperand(0).getReg();
2194   Register Vec = MI.getOperand(1).getReg();
2195 
2196   LLT VecTy = MRI.getType(Vec);
2197   LLT EltTy = VecTy.getElementType();
2198   assert(EltTy == MRI.getType(Dst));
2199 
2200   if (IdxVal < VecTy.getNumElements())
2201     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2202   else
2203     B.buildUndef(Dst);
2204 
2205   MI.eraseFromParent();
2206   return true;
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2210   MachineInstr &MI, MachineRegisterInfo &MRI,
2211   MachineIRBuilder &B) const {
2212   // TODO: Should move some of this into LegalizerHelper.
2213 
2214   // TODO: Promote dynamic indexing of s16 to s32
2215 
2216   // FIXME: Artifact combiner probably should have replaced the truncated
2217   // constant before this, so we shouldn't need
2218   // getConstantVRegValWithLookThrough.
2219   Optional<ValueAndVReg> MaybeIdxVal =
2220       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2221   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2222     return true;
2223 
2224   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2225   Register Dst = MI.getOperand(0).getReg();
2226   Register Vec = MI.getOperand(1).getReg();
2227   Register Ins = MI.getOperand(2).getReg();
2228 
2229   LLT VecTy = MRI.getType(Vec);
2230   LLT EltTy = VecTy.getElementType();
2231   assert(EltTy == MRI.getType(Ins));
2232 
2233   if (IdxVal < VecTy.getNumElements())
2234     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2235   else
2236     B.buildUndef(Dst);
2237 
2238   MI.eraseFromParent();
2239   return true;
2240 }
2241 
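// Keep v2s16 shuffles whose mask is legal for the packed (VOP3P)
// instructions; lower everything else with the generic LegalizerHelper.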
2242 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2243   MachineInstr &MI, MachineRegisterInfo &MRI,
2244   MachineIRBuilder &B) const {
2245   const LLT V2S16 = LLT::fixed_vector(2, 16);
2246 
2247   Register Dst = MI.getOperand(0).getReg();
2248   Register Src0 = MI.getOperand(1).getReg();
2249   LLT DstTy = MRI.getType(Dst);
2250   LLT SrcTy = MRI.getType(Src0);
2251 
2252   if (SrcTy == V2S16 && DstTy == V2S16 &&
2253       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2254     return true;
2255 
2256   MachineIRBuilder HelperBuilder(MI);
2257   GISelObserverWrapper DummyObserver;
2258   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2259   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2260 }
2261 
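// Lower G_FSIN/G_FCOS to the amdgcn sin/cos intrinsics. The input is
// pre-scaled by 1/(2*pi), and subtargets with a reduced trig input range
// additionally take the fractional part of the scaled value.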
2262 bool AMDGPULegalizerInfo::legalizeSinCos(
2263   MachineInstr &MI, MachineRegisterInfo &MRI,
2264   MachineIRBuilder &B) const {
2265 
2266   Register DstReg = MI.getOperand(0).getReg();
2267   Register SrcReg = MI.getOperand(1).getReg();
2268   LLT Ty = MRI.getType(DstReg);
2269   unsigned Flags = MI.getFlags();
2270 
2271   Register TrigVal;
2272   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2273   if (ST.hasTrigReducedRange()) {
2274     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2275     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2276       .addUse(MulVal.getReg(0))
2277       .setMIFlags(Flags).getReg(0);
2278   } else
2279     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2280 
2281   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2282     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2283   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2284     .addUse(TrigVal)
2285     .setMIFlags(Flags);
2286   MI.eraseFromParent();
2287   return true;
2288 }
2289 
2290 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2291                                                   MachineIRBuilder &B,
2292                                                   const GlobalValue *GV,
2293                                                   int64_t Offset,
2294                                                   unsigned GAFlags) const {
2295   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2296   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2297   // to the following code sequence:
2298   //
2299   // For constant address space:
2300   //   s_getpc_b64 s[0:1]
2301   //   s_add_u32 s0, s0, $symbol
2302   //   s_addc_u32 s1, s1, 0
2303   //
2304   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2305   //   a fixup or relocation is emitted to replace $symbol with a literal
2306   //   constant, which is a pc-relative offset from the encoding of the $symbol
2307   //   operand to the global variable.
2308   //
2309   // For global address space:
2310   //   s_getpc_b64 s[0:1]
2311   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2312   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2313   //
2314   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2315   //   fixups or relocations are emitted to replace $symbol@*@lo and
2316   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2317   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2318   //   operand to the global variable.
2319   //
2320   // What we want here is an offset from the value returned by s_getpc
2321   // (which is the address of the s_add_u32 instruction) to the global
2322   // variable, but since the encoding of $symbol starts 4 bytes after the start
2323   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2324   // small. This requires us to add 4 to the global variable offset in order to
2325   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2326   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2327   // instruction.
2328 
2329   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2330 
2331   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2332     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2333 
2334   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2335     .addDef(PCReg);
2336 
2337   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2338   if (GAFlags == SIInstrInfo::MO_NONE)
2339     MIB.addImm(0);
2340   else
2341     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2342 
2343   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2344 
2345   if (PtrTy.getSizeInBits() == 32)
2346     B.buildExtract(DstReg, PCReg, 0);
2347   return true;
2348 }
2349 
2350 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2351   MachineInstr &MI, MachineRegisterInfo &MRI,
2352   MachineIRBuilder &B) const {
2353   Register DstReg = MI.getOperand(0).getReg();
2354   LLT Ty = MRI.getType(DstReg);
2355   unsigned AS = Ty.getAddressSpace();
2356 
2357   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2358   MachineFunction &MF = B.getMF();
2359   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2360 
2361   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2362     if (!MFI->isModuleEntryFunction() &&
2363         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2364       const Function &Fn = MF.getFunction();
2365       DiagnosticInfoUnsupported BadLDSDecl(
2366         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2367         DS_Warning);
2368       Fn.getContext().diagnose(BadLDSDecl);
2369 
2370       // We currently don't have a way to correctly allocate LDS objects that
2371       // aren't directly associated with a kernel. We do force inlining of
2372       // functions that use local objects. However, if these dead functions are
2373       // not eliminated, we don't want a compile time error. Just emit a warning
2374       // and a trap, since there should be no callable path here.
2375       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2376       B.buildUndef(DstReg);
2377       MI.eraseFromParent();
2378       return true;
2379     }
2380 
2381     // TODO: We could emit code to handle the initialization somewhere.
2382     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2383       const SITargetLowering *TLI = ST.getTargetLowering();
2384       if (!TLI->shouldUseLDSConstAddress(GV)) {
2385         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2386         return true; // Leave in place;
2387       }
2388 
2389       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2390         Type *Ty = GV->getValueType();
2391         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2392         // zero-sized type in other languages to declare the dynamic shared
2393         // memory whose size is not known at compile time. Such arrays are
2394         // allocated by the runtime and placed directly after the statically
2395         // allocated ones; they all share the same offset.
2396         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2397           // Adjust alignment for that dynamic shared memory array.
2398           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2399           LLT S32 = LLT::scalar(32);
2400           auto Sz =
2401               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2402           B.buildIntToPtr(DstReg, Sz);
2403           MI.eraseFromParent();
2404           return true;
2405         }
2406       }
2407 
2408       B.buildConstant(
2409           DstReg,
2410           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2411       MI.eraseFromParent();
2412       return true;
2413     }
2414 
2415     const Function &Fn = MF.getFunction();
2416     DiagnosticInfoUnsupported BadInit(
2417       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2418     Fn.getContext().diagnose(BadInit);
2419     return true;
2420   }
2421 
2422   const SITargetLowering *TLI = ST.getTargetLowering();
2423 
2424   if (TLI->shouldEmitFixup(GV)) {
2425     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2426     MI.eraseFromParent();
2427     return true;
2428   }
2429 
2430   if (TLI->shouldEmitPCReloc(GV)) {
2431     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2432     MI.eraseFromParent();
2433     return true;
2434   }
2435 
2436   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2437   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2438 
2439   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2440       MachinePointerInfo::getGOT(MF),
2441       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2442           MachineMemOperand::MOInvariant,
2443       8 /*Size*/, Align(8));
2444 
2445   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2446 
2447   if (Ty.getSizeInBits() == 32) {
2448     // Truncate if this is a 32-bit constant address.
2449     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2450     B.buildExtract(DstReg, Load, 0);
2451   } else
2452     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2453 
2454   MI.eraseFromParent();
2455   return true;
2456 }
2457 
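// Round a vector's number of elements, or a scalar's size in bits, up to the
// next power of 2.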
2458 static LLT widenToNextPowerOf2(LLT Ty) {
2459   if (Ty.isVector())
2460     return Ty.changeElementCount(
2461         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2462   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2463 }
2464 
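// Custom legalization for loads: cast 32-bit constant address space pointers
// to 64-bit constant pointers, and widen non-power-of-2 sized G_LOADs where
// shouldWidenLoad allows it.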
2465 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2466                                        MachineInstr &MI) const {
2467   MachineIRBuilder &B = Helper.MIRBuilder;
2468   MachineRegisterInfo &MRI = *B.getMRI();
2469   GISelChangeObserver &Observer = Helper.Observer;
2470 
2471   Register PtrReg = MI.getOperand(1).getReg();
2472   LLT PtrTy = MRI.getType(PtrReg);
2473   unsigned AddrSpace = PtrTy.getAddressSpace();
2474 
2475   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2476     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2477     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2478     Observer.changingInstr(MI);
2479     MI.getOperand(1).setReg(Cast.getReg(0));
2480     Observer.changedInstr(MI);
2481     return true;
2482   }
2483 
2484   if (MI.getOpcode() != AMDGPU::G_LOAD)
2485     return false;
2486 
2487   Register ValReg = MI.getOperand(0).getReg();
2488   LLT ValTy = MRI.getType(ValReg);
2489 
2490   MachineMemOperand *MMO = *MI.memoperands_begin();
2491   const unsigned ValSize = ValTy.getSizeInBits();
2492   const LLT MemTy = MMO->getMemoryType();
2493   const Align MemAlign = MMO->getAlign();
2494   const unsigned MemSize = MemTy.getSizeInBits();
2495   const unsigned AlignInBits = 8 * MemAlign.value();
2496 
2497   // Widen non-power-of-2 loads to the alignment if needed
2498   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2499     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2500 
2501     // This was already the correct extending load result type, so just adjust
2502     // the memory type.
2503     if (WideMemSize == ValSize) {
2504       MachineFunction &MF = B.getMF();
2505 
2506       MachineMemOperand *WideMMO =
2507           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2508       Observer.changingInstr(MI);
2509       MI.setMemRefs(MF, {WideMMO});
2510       Observer.changedInstr(MI);
2511       return true;
2512     }
2513 
2514     // Don't bother handling an edge case that should probably never be produced.
2515     if (ValSize > WideMemSize)
2516       return false;
2517 
2518     LLT WideTy = widenToNextPowerOf2(ValTy);
2519 
2520     Register WideLoad;
2521     if (!WideTy.isVector()) {
2522       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2523       B.buildTrunc(ValReg, WideLoad).getReg(0);
2524     } else {
2525       // Extract the subvector.
2526 
2527       if (isRegisterType(ValTy)) {
2528         // If this is a case where G_EXTRACT is legal, use it.
2529         // (e.g. <3 x s32> -> <4 x s32>)
2530         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2531         B.buildExtract(ValReg, WideLoad, 0);
2532       } else {
2533         // For cases where the widened type isn't a nice register value, unmerge
2534         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2535         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2536         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2537         B.setInsertPt(B.getMBB(), MI.getIterator());
2538         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2539       }
2540     }
2541 
2542     MI.eraseFromParent();
2543     return true;
2544   }
2545 
2546   return false;
2547 }
2548 
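// G_FMAD is only kept when denormals are flushed for the result type;
// otherwise it is expanded by the generic LegalizerHelper.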
2549 bool AMDGPULegalizerInfo::legalizeFMad(
2550   MachineInstr &MI, MachineRegisterInfo &MRI,
2551   MachineIRBuilder &B) const {
2552   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2553   assert(Ty.isScalar());
2554 
2555   MachineFunction &MF = B.getMF();
2556   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2557 
2558   // TODO: Always legal with future ftz flag.
2559   // FIXME: Do we need just output?
2560   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2561     return true;
2562   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2563     return true;
2564 
2565   MachineIRBuilder HelperBuilder(MI);
2566   GISelObserverWrapper DummyObserver;
2567   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2568   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2569 }
2570 
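// Lower G_ATOMIC_CMPXCHG on flat/global pointers to G_AMDGPU_ATOMIC_CMPXCHG,
// which takes the new value and the compare value packed into a two element
// vector.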
2571 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2572   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2573   Register DstReg = MI.getOperand(0).getReg();
2574   Register PtrReg = MI.getOperand(1).getReg();
2575   Register CmpVal = MI.getOperand(2).getReg();
2576   Register NewVal = MI.getOperand(3).getReg();
2577 
2578   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2579          "this should not have been custom lowered");
2580 
2581   LLT ValTy = MRI.getType(CmpVal);
2582   LLT VecTy = LLT::fixed_vector(2, ValTy);
2583 
2584   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2585 
2586   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2587     .addDef(DstReg)
2588     .addUse(PtrReg)
2589     .addUse(PackedVal)
2590     .setMemRefs(MI.memoperands());
2591 
2592   MI.eraseFromParent();
2593   return true;
2594 }
2595 
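// Expand G_FLOG/G_FLOG10 as log2(x) * Log2BaseInverted, where
// Log2BaseInverted is ln(2) for log and ln(2)/ln(10) for log10.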
2596 bool AMDGPULegalizerInfo::legalizeFlog(
2597   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2598   Register Dst = MI.getOperand(0).getReg();
2599   Register Src = MI.getOperand(1).getReg();
2600   LLT Ty = B.getMRI()->getType(Dst);
2601   unsigned Flags = MI.getFlags();
2602 
2603   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2604   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2605 
2606   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2607   MI.eraseFromParent();
2608   return true;
2609 }
2610 
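// Expand G_FEXP as exp2(x * log2(e)).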
2611 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2612                                        MachineIRBuilder &B) const {
2613   Register Dst = MI.getOperand(0).getReg();
2614   Register Src = MI.getOperand(1).getReg();
2615   unsigned Flags = MI.getFlags();
2616   LLT Ty = B.getMRI()->getType(Dst);
2617 
2618   auto K = B.buildFConstant(Ty, numbers::log2e);
2619   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2620   B.buildFExp2(Dst, Mul, Flags);
2621   MI.eraseFromParent();
2622   return true;
2623 }
2624 
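// Expand G_FPOW as exp2(log2(x) * y). The multiply uses the legacy multiply
// intrinsic, and f16 operands are extended to f32 for it since there is no
// f16 legacy multiply.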
2625 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2626                                        MachineIRBuilder &B) const {
2627   Register Dst = MI.getOperand(0).getReg();
2628   Register Src0 = MI.getOperand(1).getReg();
2629   Register Src1 = MI.getOperand(2).getReg();
2630   unsigned Flags = MI.getFlags();
2631   LLT Ty = B.getMRI()->getType(Dst);
2632   const LLT S16 = LLT::scalar(16);
2633   const LLT S32 = LLT::scalar(32);
2634 
2635   if (Ty == S32) {
2636     auto Log = B.buildFLog2(S32, Src0, Flags);
2637     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2638       .addUse(Log.getReg(0))
2639       .addUse(Src1)
2640       .setMIFlags(Flags);
2641     B.buildFExp2(Dst, Mul, Flags);
2642   } else if (Ty == S16) {
2643     // There's no f16 fmul_legacy, so we need to convert for it.
2644     auto Log = B.buildFLog2(S16, Src0, Flags);
2645     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2646     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2647     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2648       .addUse(Ext0.getReg(0))
2649       .addUse(Ext1.getReg(0))
2650       .setMIFlags(Flags);
2651 
2652     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2653   } else
2654     return false;
2655 
2656   MI.eraseFromParent();
2657   return true;
2658 }
2659 
2660 // Find a source register, ignoring any possible source modifiers.
2661 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2662   Register ModSrc = OrigSrc;
2663   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2664     ModSrc = SrcFNeg->getOperand(1).getReg();
2665     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2666       ModSrc = SrcFAbs->getOperand(1).getReg();
2667   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2668     ModSrc = SrcFAbs->getOperand(1).getReg();
2669   return ModSrc;
2670 }
2671 
2672 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2673                                          MachineRegisterInfo &MRI,
2674                                          MachineIRBuilder &B) const {
2675 
2676   const LLT S1 = LLT::scalar(1);
2677   const LLT S64 = LLT::scalar(64);
2678   Register Dst = MI.getOperand(0).getReg();
2679   Register OrigSrc = MI.getOperand(1).getReg();
2680   unsigned Flags = MI.getFlags();
2681   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2682          "this should not have been custom lowered");
2683 
2684   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2685   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2686   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2687   // V_FRACT bug is:
2688   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2689   //
2690   // Convert floor(x) to (x - fract(x))
2691 
2692   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2693     .addUse(OrigSrc)
2694     .setMIFlags(Flags);
2695 
2696   // Give source modifier matching some assistance before obscuring a foldable
2697   // pattern.
2698 
2699   // TODO: We can avoid the neg on the fract? The input sign to fract
2700   // shouldn't matter?
2701   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2702 
2703   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2704 
2705   Register Min = MRI.createGenericVirtualRegister(S64);
2706 
2707   // We don't need to concern ourselves with the snan handling difference, so
2708   // use the one which will directly select.
2709   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2710   if (MFI->getMode().IEEE)
2711     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2712   else
2713     B.buildFMinNum(Min, Fract, Const, Flags);
2714 
2715   Register CorrectedFract = Min;
2716   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2717     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2718     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2719   }
2720 
2721   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2722   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2723 
2724   MI.eraseFromParent();
2725   return true;
2726 }
2727 
2728 // Turn an illegal packed v2s16 build vector into bit operations.
2729 // TODO: This should probably be a bitcast action in LegalizerHelper.
2730 bool AMDGPULegalizerInfo::legalizeBuildVector(
2731   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2732   Register Dst = MI.getOperand(0).getReg();
2733   const LLT S32 = LLT::scalar(32);
2734   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2735 
2736   Register Src0 = MI.getOperand(1).getReg();
2737   Register Src1 = MI.getOperand(2).getReg();
2738   assert(MRI.getType(Src0) == LLT::scalar(16));
2739 
2740   auto Merge = B.buildMerge(S32, {Src0, Src1});
2741   B.buildBitcast(Dst, Merge);
2742 
2743   MI.eraseFromParent();
2744   return true;
2745 }
2746 
2747 // Check that this is a G_XOR x, -1
2748 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2749   if (MI.getOpcode() != TargetOpcode::G_XOR)
2750     return false;
2751   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2752   return ConstVal && *ConstVal == -1;
2753 }
2754 
2755 // Return the use branch instruction, or null if the usage is invalid.
2756 static MachineInstr *
2757 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2758                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2759   Register CondDef = MI.getOperand(0).getReg();
2760   if (!MRI.hasOneNonDBGUse(CondDef))
2761     return nullptr;
2762 
2763   MachineBasicBlock *Parent = MI.getParent();
2764   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2765 
2766   if (isNot(MRI, *UseMI)) {
2767     Register NegatedCond = UseMI->getOperand(0).getReg();
2768     if (!MRI.hasOneNonDBGUse(NegatedCond))
2769       return nullptr;
2770 
2771     // We're deleting the def of this value, so we need to remove it.
2772     UseMI->eraseFromParent();
2773 
2774     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2775     Negated = true;
2776   }
2777 
2778   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2779     return nullptr;
2780 
2781   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2782   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2783   if (Next == Parent->end()) {
2784     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2785     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2786       return nullptr;
2787     UncondBrTarget = &*NextMBB;
2788   } else {
2789     if (Next->getOpcode() != AMDGPU::G_BR)
2790       return nullptr;
2791     Br = &*Next;
2792     UncondBrTarget = Br->getOperand(0).getMBB();
2793   }
2794 
2795   return UseMI;
2796 }
2797 
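// Copy the preloaded argument described by Arg from its incoming physical
// register into DstReg. For masked arguments (several values packed into one
// register), shift and mask out the relevant bits first.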
2798 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2799                                          const ArgDescriptor *Arg,
2800                                          const TargetRegisterClass *ArgRC,
2801                                          LLT ArgTy) const {
2802   MCRegister SrcReg = Arg->getRegister();
2803   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2804   assert(DstReg.isVirtual() && "Virtual register expected");
2805 
2806   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2807                                              ArgTy);
2808   if (Arg->isMasked()) {
2809     // TODO: Should we try to emit this once in the entry block?
2810     const LLT S32 = LLT::scalar(32);
2811     const unsigned Mask = Arg->getMask();
2812     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2813 
2814     Register AndMaskSrc = LiveIn;
2815 
2816     if (Shift != 0) {
2817       auto ShiftAmt = B.buildConstant(S32, Shift);
2818       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2819     }
2820 
2821     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2822   } else {
2823     B.buildCopy(DstReg, LiveIn);
2824   }
2825 
2826   return true;
2827 }
2828 
2829 bool AMDGPULegalizerInfo::loadInputValue(
2830     Register DstReg, MachineIRBuilder &B,
2831     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2832   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2833   const ArgDescriptor *Arg;
2834   const TargetRegisterClass *ArgRC;
2835   LLT ArgTy;
2836   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2837 
2838   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2839     return false; // TODO: Handle these
2840   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2841 }
2842 
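// Replace an intrinsic that reads a preloaded input with a copy of the
// corresponding live-in register; fails if the value was not preloaded into a
// register.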
2843 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2844     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2845     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2846   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2847     return false;
2848 
2849   MI.eraseFromParent();
2850   return true;
2851 }
2852 
2853 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2854                                        MachineRegisterInfo &MRI,
2855                                        MachineIRBuilder &B) const {
2856   Register Dst = MI.getOperand(0).getReg();
2857   LLT DstTy = MRI.getType(Dst);
2858   LLT S16 = LLT::scalar(16);
2859   LLT S32 = LLT::scalar(32);
2860   LLT S64 = LLT::scalar(64);
2861 
2862   if (DstTy == S16)
2863     return legalizeFDIV16(MI, MRI, B);
2864   if (DstTy == S32)
2865     return legalizeFDIV32(MI, MRI, B);
2866   if (DstTy == S64)
2867     return legalizeFDIV64(MI, MRI, B);
2868 
2869   return false;
2870 }
2871 
2872 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2873                                                         Register DstDivReg,
2874                                                         Register DstRemReg,
2875                                                         Register X,
2876                                                         Register Y) const {
2877   const LLT S1 = LLT::scalar(1);
2878   const LLT S32 = LLT::scalar(32);
2879 
2880   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2881   // algorithm used here.
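  // In short: form z ~= 2^32 / y from V_RCP_IFLAG_F32 scaled by ~2^32
  // (0x4f7ffffe), refine z with one Newton-Raphson step, take q = umulh(x, z)
  // and r = x - q * y, then apply two conditional correction rounds to q/r.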
2882 
2883   // Initial estimate of inv(y).
2884   auto FloatY = B.buildUITOFP(S32, Y);
2885   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2886   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2887   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2888   auto Z = B.buildFPTOUI(S32, ScaledY);
2889 
2890   // One round of UNR (unsigned integer Newton-Raphson refinement).
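  // z' = z + umulh(z, -y * z): one fixed-point step of z' = z * (2 - y * z),
  // converging toward 2^32 / y.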
2891   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2892   auto NegYZ = B.buildMul(S32, NegY, Z);
2893   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2894 
2895   // Quotient/remainder estimate.
2896   auto Q = B.buildUMulH(S32, X, Z);
2897   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2898 
2899   // First quotient/remainder refinement.
2900   auto One = B.buildConstant(S32, 1);
2901   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2902   if (DstDivReg)
2903     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2904   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2905 
2906   // Second quotient/remainder refinement.
2907   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2908   if (DstDivReg)
2909     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2910 
2911   if (DstRemReg)
2912     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2913 }
2914 
2915 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2916 //
2917 // Return lo, hi of result
2918 //
2919 // %cvt.lo = G_UITOFP Val.lo
2920 // %cvt.hi = G_UITOFP Val.hi
2921 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2922 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2923 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2924 // %mul2 = G_FMUL %mul1, 2**(-32)
2925 // %trunc = G_INTRINSIC_TRUNC %mul2
2926 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2927 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
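//
// The {lo, hi} pair approximates 2^64 / Val; the 64-bit DIV/REM expansion
// below refines it with additional Newton-Raphson steps.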
2928 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2929                                                        Register Val) {
2930   const LLT S32 = LLT::scalar(32);
2931   auto Unmerge = B.buildUnmerge(S32, Val);
2932 
2933   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2934   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2935 
2936   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2937                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2938 
2939   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2940   auto Mul1 =
2941       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2942 
2943   // 2**(-32)
2944   auto Mul2 =
2945       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2946   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2947 
2948   // -(2**32)
2949   auto Mad2 = B.buildFMAD(S32, Trunc,
2950                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2951 
2952   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2953   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2954 
2955   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2956 }
2957 
2958 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2959                                                         Register DstDivReg,
2960                                                         Register DstRemReg,
2961                                                         Register Numer,
2962                                                         Register Denom) const {
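  // Start from the ~(2^64 / Denom) estimate produced by emitReciprocalU64,
  // apply two Newton-Raphson refinement rounds carried out in 32-bit halves
  // (Add1, Add2), take the quotient estimate MulHi3 = umulh(Numer, Add2), and
  // correct the quotient/remainder with at most two conditional adjustments
  // (the C3/C6 selects below).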
2963   const LLT S32 = LLT::scalar(32);
2964   const LLT S64 = LLT::scalar(64);
2965   const LLT S1 = LLT::scalar(1);
2966   Register RcpLo, RcpHi;
2967 
2968   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2969 
2970   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2971 
2972   auto Zero64 = B.buildConstant(S64, 0);
2973   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2974 
2975   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2976   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2977 
2978   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2979   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2980   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2981 
2982   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2983   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2984   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2985   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2986 
2987   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2988   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2989   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2990   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2991   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2992 
2993   auto Zero32 = B.buildConstant(S32, 0);
2994   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2995   auto Add2_HiC =
2996       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2997   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2998   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2999 
3000   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3001   Register NumerLo = UnmergeNumer.getReg(0);
3002   Register NumerHi = UnmergeNumer.getReg(1);
3003 
3004   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3005   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3006   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3007   Register Mul3_Lo = UnmergeMul3.getReg(0);
3008   Register Mul3_Hi = UnmergeMul3.getReg(1);
3009   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3010   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3011   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3012   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3013 
3014   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3015   Register DenomLo = UnmergeDenom.getReg(0);
3016   Register DenomHi = UnmergeDenom.getReg(1);
3017 
3018   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3019   auto C1 = B.buildSExt(S32, CmpHi);
3020 
3021   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3022   auto C2 = B.buildSExt(S32, CmpLo);
3023 
3024   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3025   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3026 
3027   // TODO: Here and below, portions of the code could be enclosed in if/endif
3028   // blocks. Currently control flow is unconditional and we have 4 selects
3029   // after the potential endif to substitute PHIs.
3030 
3031   // if C3 != 0 ...
3032   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3033   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3034   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3035   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3036 
3037   auto One64 = B.buildConstant(S64, 1);
3038   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3039 
3040   auto C4 =
3041       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3042   auto C5 =
3043       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3044   auto C6 = B.buildSelect(
3045       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3046 
3047   // if (C6 != 0)
3048   auto Add4 = B.buildAdd(S64, Add3, One64);
3049   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3050 
3051   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3052   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3053   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3054 
3055   // endif C6
3056   // endif C3
3057 
3058   if (DstDivReg) {
3059     auto Sel1 = B.buildSelect(
3060         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3061     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3062                   Sel1, MulHi3);
3063   }
3064 
3065   if (DstRemReg) {
3066     auto Sel2 = B.buildSelect(
3067         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3068     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3069                   Sel2, Sub1);
3070   }
3071 }
3072 
3073 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3074                                                   MachineRegisterInfo &MRI,
3075                                                   MachineIRBuilder &B) const {
3076   Register DstDivReg, DstRemReg;
3077   switch (MI.getOpcode()) {
3078   default:
3079     llvm_unreachable("Unexpected opcode!");
3080   case AMDGPU::G_UDIV: {
3081     DstDivReg = MI.getOperand(0).getReg();
3082     break;
3083   }
3084   case AMDGPU::G_UREM: {
3085     DstRemReg = MI.getOperand(0).getReg();
3086     break;
3087   }
3088   case AMDGPU::G_UDIVREM: {
3089     DstDivReg = MI.getOperand(0).getReg();
3090     DstRemReg = MI.getOperand(1).getReg();
3091     break;
3092   }
3093   }
3094 
3095   const LLT S64 = LLT::scalar(64);
3096   const LLT S32 = LLT::scalar(32);
3097   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3098   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3099   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3100   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3101 
3102   if (Ty == S32)
3103     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3104   else if (Ty == S64)
3105     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3106   else
3107     return false;
3108 
3109   MI.eraseFromParent();
3110   return true;
3111 }
3112 
3113 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3114                                                 MachineRegisterInfo &MRI,
3115                                                 MachineIRBuilder &B) const {
3116   const LLT S64 = LLT::scalar(64);
3117   const LLT S32 = LLT::scalar(32);
3118 
3119   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3120   if (Ty != S32 && Ty != S64)
3121     return false;
3122 
3123   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3124   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3125   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3126 
3127   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3128   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3129   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3130 
3131   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3132   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3133 
3134   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3135   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
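  // LHS and RHS now hold the absolute values: (x + sign) ^ sign == |x| for
  // sign = x >> (bitwidth - 1).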
3136 
3137   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3138   switch (MI.getOpcode()) {
3139   default:
3140     llvm_unreachable("Unexpected opcode!");
3141   case AMDGPU::G_SDIV: {
3142     DstDivReg = MI.getOperand(0).getReg();
3143     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3144     break;
3145   }
3146   case AMDGPU::G_SREM: {
3147     DstRemReg = MI.getOperand(0).getReg();
3148     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3149     break;
3150   }
3151   case AMDGPU::G_SDIVREM: {
3152     DstDivReg = MI.getOperand(0).getReg();
3153     DstRemReg = MI.getOperand(1).getReg();
3154     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3155     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3156     break;
3157   }
3158   }
3159 
3160   if (Ty == S32)
3161     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3162   else
3163     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3164 
3165   if (DstDivReg) {
3166     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3167     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3168     B.buildSub(DstDivReg, SignXor, Sign);
3169   }
3170 
3171   if (DstRemReg) {
3172     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3173     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3174     B.buildSub(DstRemReg, SignXor, Sign);
3175   }
3176 
3177   MI.eraseFromParent();
3178   return true;
3179 }
3180 
3181 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3182                                                  MachineRegisterInfo &MRI,
3183                                                  MachineIRBuilder &B) const {
3184   Register Res = MI.getOperand(0).getReg();
3185   Register LHS = MI.getOperand(1).getReg();
3186   Register RHS = MI.getOperand(2).getReg();
3187   uint16_t Flags = MI.getFlags();
3188   LLT ResTy = MRI.getType(Res);
3189 
3190   const MachineFunction &MF = B.getMF();
3191   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3192                             MI.getFlag(MachineInstr::FmAfn);
3193 
3194   if (!AllowInaccurateRcp)
3195     return false;
3196 
3197   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3198     // 1 / x -> RCP(x)
3199     if (CLHS->isExactlyValue(1.0)) {
3200       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3201         .addUse(RHS)
3202         .setMIFlags(Flags);
3203 
3204       MI.eraseFromParent();
3205       return true;
3206     }
3207 
3208     // -1 / x -> RCP( FNEG(x) )
3209     if (CLHS->isExactlyValue(-1.0)) {
3210       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3211       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3212         .addUse(FNeg.getReg(0))
3213         .setMIFlags(Flags);
3214 
3215       MI.eraseFromParent();
3216       return true;
3217     }
3218   }
3219 
3220   // x / y -> x * (1.0 / y)
3221   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3222     .addUse(RHS)
3223     .setMIFlags(Flags);
3224   B.buildFMul(Res, LHS, RCP, Flags);
3225 
3226   MI.eraseFromParent();
3227   return true;
3228 }
3229 
3230 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3231                                                    MachineRegisterInfo &MRI,
3232                                                    MachineIRBuilder &B) const {
3233   Register Res = MI.getOperand(0).getReg();
3234   Register X = MI.getOperand(1).getReg();
3235   Register Y = MI.getOperand(2).getReg();
3236   uint16_t Flags = MI.getFlags();
3237   LLT ResTy = MRI.getType(Res);
3238 
3239   const MachineFunction &MF = B.getMF();
3240   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3241                             MI.getFlag(MachineInstr::FmAfn);
3242 
3243   if (!AllowInaccurateRcp)
3244     return false;
3245 
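  // x/y ~= x * rcp(y): refine rcp(y) with two Newton-Raphson iterations, then
  // correct the product with one final FMA step.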
3246   auto NegY = B.buildFNeg(ResTy, Y);
3247   auto One = B.buildFConstant(ResTy, 1.0);
3248 
3249   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3250     .addUse(Y)
3251     .setMIFlags(Flags);
3252 
3253   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3254   R = B.buildFMA(ResTy, Tmp0, R, R);
3255 
3256   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3257   R = B.buildFMA(ResTy, Tmp1, R, R);
3258 
3259   auto Ret = B.buildFMul(ResTy, X, R);
3260   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3261 
3262   B.buildFMA(Res, Tmp2, R, Ret);
3263   MI.eraseFromParent();
3264   return true;
3265 }
3266 
3267 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3268                                          MachineRegisterInfo &MRI,
3269                                          MachineIRBuilder &B) const {
3270   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3271     return true;
3272 
3273   Register Res = MI.getOperand(0).getReg();
3274   Register LHS = MI.getOperand(1).getReg();
3275   Register RHS = MI.getOperand(2).getReg();
3276 
3277   uint16_t Flags = MI.getFlags();
3278 
3279   LLT S16 = LLT::scalar(16);
3280   LLT S32 = LLT::scalar(32);
3281 
3282   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3283   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3284 
3285   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3286     .addUse(RHSExt.getReg(0))
3287     .setMIFlags(Flags);
3288 
3289   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3290   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3291 
3292   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3293     .addUse(RDst.getReg(0))
3294     .addUse(RHS)
3295     .addUse(LHS)
3296     .setMIFlags(Flags);
3297 
3298   MI.eraseFromParent();
3299   return true;
3300 }
3301 
3302 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3303 // that turn FP32 denormals on; when false, restore the function's default mode.
3304 static void toggleSPDenormMode(bool Enable,
3305                                MachineIRBuilder &B,
3306                                const GCNSubtarget &ST,
3307                                AMDGPU::SIModeRegisterDefaults Mode) {
3308   // Set SP denorm mode to this value.
3309   unsigned SPDenormMode =
3310     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3311 
3312   if (ST.hasDenormModeInst()) {
3313     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3314     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3315 
3316     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3317     B.buildInstr(AMDGPU::S_DENORM_MODE)
3318       .addImm(NewDenormModeValue);
3319 
3320   } else {
3321     // Select FP32 bit field in mode register.
3322     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3323                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3324                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
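    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm mode is the
    // 2-bit field at offset 4 of the MODE register.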
3325 
3326     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3327       .addImm(SPDenormMode)
3328       .addImm(SPDenormModeBitField);
3329   }
3330 }
3331 
3332 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3333                                          MachineRegisterInfo &MRI,
3334                                          MachineIRBuilder &B) const {
3335   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3336     return true;
3337 
3338   Register Res = MI.getOperand(0).getReg();
3339   Register LHS = MI.getOperand(1).getReg();
3340   Register RHS = MI.getOperand(2).getReg();
3341   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3342   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3343 
3344   uint16_t Flags = MI.getFlags();
3345 
3346   LLT S32 = LLT::scalar(32);
3347   LLT S1 = LLT::scalar(1);
3348 
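  // IEEE-accurate expansion: scale the operands with amdgcn.div.scale, refine
  // an initial rcp of the scaled denominator with a chain of FMAs (temporarily
  // enabling FP32 denormals around the FMA chain when they are not already
  // on), then combine the results with amdgcn.div.fmas and amdgcn.div.fixup.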
3349   auto One = B.buildFConstant(S32, 1.0f);
3350 
3351   auto DenominatorScaled =
3352     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3353       .addUse(LHS)
3354       .addUse(RHS)
3355       .addImm(0)
3356       .setMIFlags(Flags);
3357   auto NumeratorScaled =
3358     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3359       .addUse(LHS)
3360       .addUse(RHS)
3361       .addImm(1)
3362       .setMIFlags(Flags);
3363 
3364   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3365     .addUse(DenominatorScaled.getReg(0))
3366     .setMIFlags(Flags);
3367   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3368 
3369   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3370   // aren't modeled as reading it.
3371   if (!Mode.allFP32Denormals())
3372     toggleSPDenormMode(true, B, ST, Mode);
3373 
3374   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3375   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3376   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3377   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3378   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3379   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3380 
3381   if (!Mode.allFP32Denormals())
3382     toggleSPDenormMode(false, B, ST, Mode);
3383 
3384   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3385     .addUse(Fma4.getReg(0))
3386     .addUse(Fma1.getReg(0))
3387     .addUse(Fma3.getReg(0))
3388     .addUse(NumeratorScaled.getReg(1))
3389     .setMIFlags(Flags);
3390 
3391   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3392     .addUse(Fmas.getReg(0))
3393     .addUse(RHS)
3394     .addUse(LHS)
3395     .setMIFlags(Flags);
3396 
3397   MI.eraseFromParent();
3398   return true;
3399 }
3400 
3401 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3402                                          MachineRegisterInfo &MRI,
3403                                          MachineIRBuilder &B) const {
3404   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3405     return true;
3406 
3407   Register Res = MI.getOperand(0).getReg();
3408   Register LHS = MI.getOperand(1).getReg();
3409   Register RHS = MI.getOperand(2).getReg();
3410 
3411   uint16_t Flags = MI.getFlags();
3412 
3413   LLT S64 = LLT::scalar(64);
3414   LLT S1 = LLT::scalar(1);
3415 
3416   auto One = B.buildFConstant(S64, 1.0);
3417 
3418   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3419     .addUse(LHS)
3420     .addUse(RHS)
3421     .addImm(0)
3422     .setMIFlags(Flags);
3423 
3424   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3425 
3426   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3427     .addUse(DivScale0.getReg(0))
3428     .setMIFlags(Flags);
3429 
3430   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3431   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3432   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3433 
3434   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3435     .addUse(LHS)
3436     .addUse(RHS)
3437     .addImm(1)
3438     .setMIFlags(Flags);
3439 
3440   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3441   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3442   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3443 
3444   Register Scale;
3445   if (!ST.hasUsableDivScaleConditionOutput()) {
3446     // Workaround a hardware bug on SI where the condition output from div_scale
3447     // is not usable.
3448 
3449     LLT S32 = LLT::scalar(32);
3450 
3451     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3452     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3453     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3454     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3455 
3456     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3457                               Scale1Unmerge.getReg(1));
3458     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3459                               Scale0Unmerge.getReg(1));
3460     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3461   } else {
3462     Scale = DivScale1.getReg(1);
3463   }
3464 
3465   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3466     .addUse(Fma4.getReg(0))
3467     .addUse(Fma3.getReg(0))
3468     .addUse(Mul.getReg(0))
3469     .addUse(Scale)
3470     .setMIFlags(Flags);
3471 
3472   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3473     .addUse(Fmas.getReg(0))
3474     .addUse(RHS)
3475     .addUse(LHS)
3476     .setMIFlags(Flags);
3477 
3478   MI.eraseFromParent();
3479   return true;
3480 }
3481 
3482 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3483                                                  MachineRegisterInfo &MRI,
3484                                                  MachineIRBuilder &B) const {
3485   Register Res = MI.getOperand(0).getReg();
3486   Register LHS = MI.getOperand(2).getReg();
3487   Register RHS = MI.getOperand(3).getReg();
3488   uint16_t Flags = MI.getFlags();
3489 
3490   LLT S32 = LLT::scalar(32);
3491   LLT S1 = LLT::scalar(1);
3492 
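  // If |RHS| > 2^96 (0x6f800000), pre-scale RHS by 2^-32 (0x2f800000) before
  // taking the reciprocal, then multiply the final product by the same factor
  // so the scaling cancels out.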
3493   auto Abs = B.buildFAbs(S32, RHS, Flags);
3494   const APFloat C0Val(1.0f);
3495 
3496   auto C0 = B.buildConstant(S32, 0x6f800000);
3497   auto C1 = B.buildConstant(S32, 0x2f800000);
3498   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3499 
3500   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3501   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3502 
3503   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3504 
3505   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3506     .addUse(Mul0.getReg(0))
3507     .setMIFlags(Flags);
3508 
3509   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3510 
3511   B.buildFMul(Res, Sel, Mul1, Flags);
3512 
3513   MI.eraseFromParent();
3514   return true;
3515 }
3516 
3517 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3518 // FIXME: Why do we handle this one but not other removed instructions?
3519 //
3520 // Reciprocal square root.  The clamp prevents infinite results, clamping
3521 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3522 // +-max_float.
3523 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3524                                                     MachineRegisterInfo &MRI,
3525                                                     MachineIRBuilder &B) const {
3526   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3527     return true;
3528 
3529   Register Dst = MI.getOperand(0).getReg();
3530   Register Src = MI.getOperand(2).getReg();
3531   auto Flags = MI.getFlags();
3532 
3533   LLT Ty = MRI.getType(Dst);
3534 
3535   const fltSemantics *FltSemantics;
3536   if (Ty == LLT::scalar(32))
3537     FltSemantics = &APFloat::IEEEsingle();
3538   else if (Ty == LLT::scalar(64))
3539     FltSemantics = &APFloat::IEEEdouble();
3540   else
3541     return false;
3542 
3543   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3544     .addUse(Src)
3545     .setMIFlags(Flags);
3546 
3547   // We don't need to concern ourselves with the snan handling difference, since
3548   // the rsq result is quieted (or not) either way; use the one that selects directly.
3549   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3550   const bool UseIEEE = MFI->getMode().IEEE;
3551 
3552   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3553   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3554                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3555 
3556   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3557 
3558   if (UseIEEE)
3559     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3560   else
3561     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3562   MI.eraseFromParent();
3563   return true;
3564 }
3565 
3566 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3567   switch (IID) {
3568   case Intrinsic::amdgcn_ds_fadd:
3569     return AMDGPU::G_ATOMICRMW_FADD;
3570   case Intrinsic::amdgcn_ds_fmin:
3571     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3572   case Intrinsic::amdgcn_ds_fmax:
3573     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3574   default:
3575     llvm_unreachable("not a DS FP intrinsic");
3576   }
3577 }
3578 
3579 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3580                                                       MachineInstr &MI,
3581                                                       Intrinsic::ID IID) const {
3582   GISelChangeObserver &Observer = Helper.Observer;
3583   Observer.changingInstr(MI);
3584 
3585   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3586 
3587   // The remaining operands were used to set fields in the MemOperand on
3588   // construction.
3589   for (int I = 6; I > 3; --I)
3590     MI.RemoveOperand(I);
3591 
3592   MI.RemoveOperand(1); // Remove the intrinsic ID.
3593   Observer.changedInstr(MI);
3594   return true;
3595 }
3596 
3597 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3598                                             MachineRegisterInfo &MRI,
3599                                             MachineIRBuilder &B) const {
3600   uint64_t Offset =
3601     ST.getTargetLowering()->getImplicitParameterOffset(
3602       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3603   LLT DstTy = MRI.getType(DstReg);
3604   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3605 
3606   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3607   if (!loadInputValue(KernargPtrReg, B,
3608                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3609     return false;
3610 
3611   // FIXME: This should be nuw
3612   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3613   return true;
3614 }
3615 
3616 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3617                                                  MachineRegisterInfo &MRI,
3618                                                  MachineIRBuilder &B) const {
3619   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3620   if (!MFI->isEntryFunction()) {
3621     return legalizePreloadedArgIntrin(MI, MRI, B,
3622                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3623   }
3624 
3625   Register DstReg = MI.getOperand(0).getReg();
3626   if (!getImplicitArgPtr(DstReg, MRI, B))
3627     return false;
3628 
3629   MI.eraseFromParent();
3630   return true;
3631 }
3632 
3633 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3634                                               MachineRegisterInfo &MRI,
3635                                               MachineIRBuilder &B,
3636                                               unsigned AddrSpace) const {
3637   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3638   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3639   Register Hi32 = Unmerge.getReg(1);
3640 
3641   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3642   MI.eraseFromParent();
3643   return true;
3644 }
3645 
3646 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3647 // offset (the offset that is included in bounds checking and swizzling, to be
3648 // split between the instruction's voffset and immoffset fields) and soffset
3649 // (the offset that is excluded from bounds checking and swizzling, to go in
3650 // the instruction's soffset field).  This function takes the first kind of
3651 // offset and figures out how to split it between voffset and immoffset.
3652 std::tuple<Register, unsigned, unsigned>
3653 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3654                                         Register OrigOffset) const {
3655   const unsigned MaxImm = 4095;
3656   Register BaseReg;
3657   unsigned TotalConstOffset;
3658   const LLT S32 = LLT::scalar(32);
3659   MachineRegisterInfo &MRI = *B.getMRI();
3660 
3661   std::tie(BaseReg, TotalConstOffset) =
3662       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3663 
3664   unsigned ImmOffset = TotalConstOffset;
3665 
3666   // If BaseReg is a pointer, convert it to int.
3667   if (MRI.getType(BaseReg).isPointer())
3668     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3669 
3670   // If the immediate value is too big for the immoffset field, keep only the
3671   // low 12 bits (value & 4095) in the immoffset field, so that the value that
3672   // is copied/added for the voffset field is a multiple of 4096 and stands
3673   // more chance of being CSEd with the copy/add for another similar load/store.
3674   // However, do not do that rounding down to a multiple of 4096 if that is a
3675   // negative number, as it appears to be illegal to have a negative offset
3676   // in the vgpr, even if adding the immediate offset makes it positive.
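  // For example, a constant offset of 5000 splits into Overflow = 4096 (added
  // to the voffset) and ImmOffset = 904.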
3677   unsigned Overflow = ImmOffset & ~MaxImm;
3678   ImmOffset -= Overflow;
3679   if ((int32_t)Overflow < 0) {
3680     Overflow += ImmOffset;
3681     ImmOffset = 0;
3682   }
3683 
3684   if (Overflow != 0) {
3685     if (!BaseReg) {
3686       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3687     } else {
3688       auto OverflowVal = B.buildConstant(S32, Overflow);
3689       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3690     }
3691   }
3692 
3693   if (!BaseReg)
3694     BaseReg = B.buildConstant(S32, 0).getReg(0);
3695 
3696   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3697 }
3698 
3699 /// Handle register layout difference for f16 images for some subtargets.
3700 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3701                                              MachineRegisterInfo &MRI,
3702                                              Register Reg,
3703                                              bool ImageStore) const {
3704   const LLT S16 = LLT::scalar(16);
3705   const LLT S32 = LLT::scalar(32);
3706   LLT StoreVT = MRI.getType(Reg);
3707   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3708 
3709   if (ST.hasUnpackedD16VMem()) {
3710     auto Unmerge = B.buildUnmerge(S16, Reg);
3711 
3712     SmallVector<Register, 4> WideRegs;
3713     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3714       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3715 
3716     int NumElts = StoreVT.getNumElements();
3717 
3718     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3719         .getReg(0);
3720   }
3721 
3722   if (ImageStore && ST.hasImageStoreD16Bug()) {
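    // Workaround for subtargets where packed d16 image stores are broken:
    // bitcast the packed 16-bit data to 32-bit elements, padding with undef,
    // so the store uses a full-dword register layout.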
3723     if (StoreVT.getNumElements() == 2) {
3724       SmallVector<Register, 4> PackedRegs;
3725       Reg = B.buildBitcast(S32, Reg).getReg(0);
3726       PackedRegs.push_back(Reg);
3727       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3728       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3729           .getReg(0);
3730     }
3731 
3732     if (StoreVT.getNumElements() == 3) {
3733       SmallVector<Register, 4> PackedRegs;
3734       auto Unmerge = B.buildUnmerge(S16, Reg);
3735       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3736         PackedRegs.push_back(Unmerge.getReg(I));
3737       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3738       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3739       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3740     }
3741 
3742     if (StoreVT.getNumElements() == 4) {
3743       SmallVector<Register, 4> PackedRegs;
3744       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3745       auto Unmerge = B.buildUnmerge(S32, Reg);
3746       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3747         PackedRegs.push_back(Unmerge.getReg(I));
3748       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3749       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3750           .getReg(0);
3751     }
3752 
3753     llvm_unreachable("invalid data type");
3754   }
3755 
3756   return Reg;
3757 }
3758 
3759 Register AMDGPULegalizerInfo::fixStoreSourceType(
3760   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3761   MachineRegisterInfo *MRI = B.getMRI();
3762   LLT Ty = MRI->getType(VData);
3763 
3764   const LLT S16 = LLT::scalar(16);
3765 
3766   // Fixup illegal register types for i8 and i16 stores.
3767   if (Ty == LLT::scalar(8) || Ty == S16) {
3768     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3769     return AnyExt;
3770   }
3771 
3772   if (Ty.isVector()) {
3773     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3774       if (IsFormat)
3775         return handleD16VData(B, *MRI, VData);
3776     }
3777   }
3778 
3779   return VData;
3780 }
3781 
3782 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3783                                               MachineRegisterInfo &MRI,
3784                                               MachineIRBuilder &B,
3785                                               bool IsTyped,
3786                                               bool IsFormat) const {
3787   Register VData = MI.getOperand(1).getReg();
3788   LLT Ty = MRI.getType(VData);
3789   LLT EltTy = Ty.getScalarType();
3790   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3791   const LLT S32 = LLT::scalar(32);
3792 
3793   VData = fixStoreSourceType(B, VData, IsFormat);
3794   Register RSrc = MI.getOperand(2).getReg();
3795 
3796   MachineMemOperand *MMO = *MI.memoperands_begin();
3797   const int MemSize = MMO->getSize();
3798 
3799   unsigned ImmOffset;
3800   unsigned TotalOffset;
3801 
3802   // The typed intrinsics add an immediate after the registers.
3803   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3804 
3805   // The struct intrinsic variants add one additional operand over raw.
3806   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3807   Register VIndex;
3808   int OpOffset = 0;
3809   if (HasVIndex) {
3810     VIndex = MI.getOperand(3).getReg();
3811     OpOffset = 1;
3812   }
3813 
3814   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3815   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3816 
3817   unsigned Format = 0;
3818   if (IsTyped) {
3819     Format = MI.getOperand(5 + OpOffset).getImm();
3820     ++OpOffset;
3821   }
3822 
3823   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3824 
3825   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3826   if (TotalOffset != 0)
3827     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3828 
3829   unsigned Opc;
3830   if (IsTyped) {
3831     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3832                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3833   } else if (IsFormat) {
3834     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3835                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3836   } else {
3837     switch (MemSize) {
3838     case 1:
3839       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3840       break;
3841     case 2:
3842       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3843       break;
3844     default:
3845       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3846       break;
3847     }
3848   }
3849 
3850   if (!VIndex)
3851     VIndex = B.buildConstant(S32, 0).getReg(0);
3852 
3853   auto MIB = B.buildInstr(Opc)
3854     .addUse(VData)              // vdata
3855     .addUse(RSrc)               // rsrc
3856     .addUse(VIndex)             // vindex
3857     .addUse(VOffset)            // voffset
3858     .addUse(SOffset)            // soffset
3859     .addImm(ImmOffset);         // offset(imm)
3860 
3861   if (IsTyped)
3862     MIB.addImm(Format);
3863 
3864   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3865      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3866      .addMemOperand(MMO);
3867 
3868   MI.eraseFromParent();
3869   return true;
3870 }
3871 
3872 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3873                                              MachineRegisterInfo &MRI,
3874                                              MachineIRBuilder &B,
3875                                              bool IsFormat,
3876                                              bool IsTyped) const {
3877   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3878   MachineMemOperand *MMO = *MI.memoperands_begin();
3879   const int MemSize = MMO->getSize();
3880   const LLT S32 = LLT::scalar(32);
3881 
3882   Register Dst = MI.getOperand(0).getReg();
3883   Register RSrc = MI.getOperand(2).getReg();
3884 
3885   // The typed intrinsics add an immediate after the registers.
3886   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3887 
3888   // The struct intrinsic variants add one additional operand over raw.
3889   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3890   Register VIndex;
3891   int OpOffset = 0;
3892   if (HasVIndex) {
3893     VIndex = MI.getOperand(3).getReg();
3894     OpOffset = 1;
3895   }
3896 
3897   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3898   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3899 
3900   unsigned Format = 0;
3901   if (IsTyped) {
3902     Format = MI.getOperand(5 + OpOffset).getImm();
3903     ++OpOffset;
3904   }
3905 
3906   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3907   unsigned ImmOffset;
3908   unsigned TotalOffset;
3909 
3910   LLT Ty = MRI.getType(Dst);
3911   LLT EltTy = Ty.getScalarType();
3912   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3913   const bool Unpacked = ST.hasUnpackedD16VMem();
3914 
3915   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3916   if (TotalOffset != 0)
3917     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3918 
3919   unsigned Opc;
3920 
3921   if (IsTyped) {
3922     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3923                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3924   } else if (IsFormat) {
3925     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3926                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3927   } else {
3928     switch (MemSize) {
3929     case 1:
3930       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3931       break;
3932     case 2:
3933       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3934       break;
3935     default:
3936       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3937       break;
3938     }
3939   }
3940 
3941   Register LoadDstReg;
3942 
3943   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3944   LLT UnpackedTy = Ty.changeElementSize(32);
3945 
3946   if (IsExtLoad)
3947     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3948   else if (Unpacked && IsD16 && Ty.isVector())
3949     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3950   else
3951     LoadDstReg = Dst;
3952 
3953   if (!VIndex)
3954     VIndex = B.buildConstant(S32, 0).getReg(0);
3955 
3956   auto MIB = B.buildInstr(Opc)
3957     .addDef(LoadDstReg)         // vdata
3958     .addUse(RSrc)               // rsrc
3959     .addUse(VIndex)             // vindex
3960     .addUse(VOffset)            // voffset
3961     .addUse(SOffset)            // soffset
3962     .addImm(ImmOffset);         // offset(imm)
3963 
3964   if (IsTyped)
3965     MIB.addImm(Format);
3966 
3967   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3968      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3969      .addMemOperand(MMO);
3970 
3971   if (LoadDstReg != Dst) {
3972     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3973 
3974     // The result was widened for the extending load; truncate it back down.
3975     if (IsExtLoad)
3976       B.buildTrunc(Dst, LoadDstReg);
3977     else {
3978       // Repack to original 16-bit vector result
3979       // FIXME: G_TRUNC should work, but legalization currently fails
3980       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3981       SmallVector<Register, 4> Repack;
3982       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3983         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3984       B.buildMerge(Dst, Repack);
3985     }
3986   }
3987 
3988   MI.eraseFromParent();
3989   return true;
3990 }
3991 
3992 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3993                                                MachineIRBuilder &B,
3994                                                bool IsInc) const {
3995   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3996                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3997   B.buildInstr(Opc)
3998     .addDef(MI.getOperand(0).getReg())
3999     .addUse(MI.getOperand(2).getReg())
4000     .addUse(MI.getOperand(3).getReg())
4001     .cloneMemRefs(MI);
4002   MI.eraseFromParent();
4003   return true;
4004 }
4005 
4006 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4007   switch (IntrID) {
4008   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4009   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4010     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4011   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4012   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4013     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4014   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4015   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4016     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4017   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4018   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4019     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4020   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4021   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4022     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4023   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4024   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4025     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4026   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4027   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4028     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4029   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4030   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4031     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4032   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4033   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4034     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4035   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4036   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4037     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4038   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4039   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4040     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4041   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4042   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4043     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4044   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4045   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4046     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4047   case Intrinsic::amdgcn_buffer_atomic_fadd:
4048   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4049   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4050     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4051   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4052   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4053     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4054   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4055   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4056     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4057   default:
4058     llvm_unreachable("unhandled atomic opcode");
4059   }
4060 }
4061 
4062 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4063                                                MachineIRBuilder &B,
4064                                                Intrinsic::ID IID) const {
4065   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4066                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4067   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4068 
4069   Register Dst;
4070 
4071   int OpOffset = 0;
4072   if (HasReturn) {
4073     Dst = MI.getOperand(0).getReg();
4074   } else {
4075     // A few FP atomics do not support return values.
4076     OpOffset = -1;
4077   }
4078 
4079   Register VData = MI.getOperand(2 + OpOffset).getReg();
4080   Register CmpVal;
4081 
4082   if (IsCmpSwap) {
4083     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4084     ++OpOffset;
4085   }
4086 
4087   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4088   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4089 
4090   // The struct intrinsic variants add one additional operand over raw.
4091   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4092   Register VIndex;
4093   if (HasVIndex) {
4094     VIndex = MI.getOperand(4 + OpOffset).getReg();
4095     ++OpOffset;
4096   }
4097 
4098   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4099   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4100   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4101 
4102   MachineMemOperand *MMO = *MI.memoperands_begin();
4103 
4104   unsigned ImmOffset;
4105   unsigned TotalOffset;
4106   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4107   if (TotalOffset != 0)
4108     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4109 
4110   if (!VIndex)
4111     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4112 
4113   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4114 
4115   if (HasReturn)
4116     MIB.addDef(Dst);
4117 
4118   MIB.addUse(VData); // vdata
4119 
4120   if (IsCmpSwap)
4121     MIB.addReg(CmpVal);
4122 
4123   MIB.addUse(RSrc)               // rsrc
4124      .addUse(VIndex)             // vindex
4125      .addUse(VOffset)            // voffset
4126      .addUse(SOffset)            // soffset
4127      .addImm(ImmOffset)          // offset(imm)
4128      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4129      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4130      .addMemOperand(MMO);
4131 
4132   MI.eraseFromParent();
4133   return true;
4134 }
4135 
4136 /// Pack the s16 typed address/gradient operands of \p MI into dword sized
4137 /// <2 x s16> registers where needed, appending the results to \p PackedAddrs.
4138 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4139                                       SmallVectorImpl<Register> &PackedAddrs,
4140                                       unsigned ArgOffset,
4141                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4142                                       bool IsA16, bool IsG16) {
4143   const LLT S16 = LLT::scalar(16);
4144   const LLT V2S16 = LLT::fixed_vector(2, 16);
4145   auto EndIdx = Intr->VAddrEnd;
4146 
4147   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4148     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4149     if (!SrcOp.isReg())
4150       continue; // _L to _LZ may have eliminated this.
4151 
4152     Register AddrReg = SrcOp.getReg();
4153 
4154     if (I < Intr->GradientStart) {
4155       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4156       PackedAddrs.push_back(AddrReg);
4157     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4158                (I >= Intr->CoordStart && !IsA16)) {
4159       // Handle any gradient or coordinate operands that should not be packed
4160       PackedAddrs.push_back(AddrReg);
4161     } else {
4162       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4163       // derivatives dx/dh and dx/dv are packed with undef.
4164       if (((I + 1) >= EndIdx) ||
4165           ((Intr->NumGradients / 2) % 2 == 1 &&
4166            (I == static_cast<unsigned>(Intr->GradientStart +
4167                                        (Intr->NumGradients / 2) - 1) ||
4168             I == static_cast<unsigned>(Intr->GradientStart +
4169                                        Intr->NumGradients - 1))) ||
4170           // Check for _L to _LZ optimization
4171           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4172         PackedAddrs.push_back(
4173             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4174                 .getReg(0));
4175       } else {
4176         PackedAddrs.push_back(
4177             B.buildBuildVector(
4178                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4179                 .getReg(0));
4180         ++I;
4181       }
4182     }
4183   }
4184 }
4185 
4186 /// Convert from separate vaddr components to a single vector address register,
4187 /// and replace the remaining operands with $noreg.
4188 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4189                                      int DimIdx, int NumVAddrs) {
4190   const LLT S32 = LLT::scalar(32);
4191 
4192   SmallVector<Register, 8> AddrRegs;
4193   for (int I = 0; I != NumVAddrs; ++I) {
4194     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4195     if (SrcOp.isReg()) {
4196       AddrRegs.push_back(SrcOp.getReg());
4197       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4198     }
4199   }
4200 
4201   int NumAddrRegs = AddrRegs.size();
4202   if (NumAddrRegs != 1) {
4203     // Round up to 8 elements for v5-v7
4204     // FIXME: Missing intermediate sized register classes and instructions.
4205     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4206       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4207       auto Undef = B.buildUndef(S32);
4208       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4209       NumAddrRegs = RoundedNumRegs;
4210     }
4211 
4212     auto VAddr =
4213         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4214     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4215   }
4216 
4217   for (int I = 1; I != NumVAddrs; ++I) {
4218     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4219     if (SrcOp.isReg())
4220       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4221   }
4222 }
4223 
4224 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4225 ///
4226 /// Depending on the subtarget, load/store with 16-bit element data need to be
4227 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4228 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4229 /// registers.
4230 ///
4231 /// We don't want to directly select image instructions just yet, but also want
4232 /// to expose all register repacking to the legalizer/combiners. We also don't
4233 /// want a selected instruction entering RegBankSelect. In order to avoid
4234 /// defining a multitude of intermediate image instructions, directly hack on
4235 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4236 /// now unnecessary arguments with $noreg.
4237 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4238     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4239     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4240 
4241   const unsigned NumDefs = MI.getNumExplicitDefs();
4242   const unsigned ArgOffset = NumDefs + 1;
4243   bool IsTFE = NumDefs == 2;
4244   // We are only processing the operands of d16 image operations on subtargets
4245   // that use the unpacked register layout, or need to repack the TFE result.
4246 
4247   // TODO: Do we need to guard against already legalized intrinsics?
4248   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4249       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4250 
4251   MachineRegisterInfo *MRI = B.getMRI();
4252   const LLT S32 = LLT::scalar(32);
4253   const LLT S16 = LLT::scalar(16);
4254   const LLT V2S16 = LLT::fixed_vector(2, 16);
4255 
4256   unsigned DMask = 0;
4257 
4258   // Check for 16 bit addresses and pack if true.
4259   LLT GradTy =
4260       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4261   LLT AddrTy =
4262       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4263   const bool IsG16 = GradTy == S16;
4264   const bool IsA16 = AddrTy == S16;
4265 
4266   int DMaskLanes = 0;
4267   if (!BaseOpcode->Atomic) {
4268     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4269     if (BaseOpcode->Gather4) {
4270       DMaskLanes = 4;
4271     } else if (DMask != 0) {
4272       DMaskLanes = countPopulation(DMask);
4273     } else if (!IsTFE && !BaseOpcode->Store) {
4274       // If dmask is 0, this is a no-op load. This can be eliminated.
4275       B.buildUndef(MI.getOperand(0));
4276       MI.eraseFromParent();
4277       return true;
4278     }
4279   }
4280 
4281   Observer.changingInstr(MI);
4282   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4283 
4284   unsigned NewOpcode = NumDefs == 0 ?
4285     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4286 
4287   // Track that we legalized this
4288   MI.setDesc(B.getTII().get(NewOpcode));
4289 
4290   // We are expecting to get an error flag since TFC is on and dmask is 0.
4291   // Force dmask to be at least 1, otherwise the instruction will fail.
4292   if (IsTFE && DMask == 0) {
4293     DMask = 0x1;
4294     DMaskLanes = 1;
4295     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4296   }
4297 
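       // Atomics with two data operands (AtomicX2, e.g. cmpswap) pack both
       // values into a single vector data operand.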
4298   if (BaseOpcode->Atomic) {
4299     Register VData0 = MI.getOperand(2).getReg();
4300     LLT Ty = MRI->getType(VData0);
4301 
4302     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4303     if (Ty.isVector())
4304       return false;
4305 
4306     if (BaseOpcode->AtomicX2) {
4307       Register VData1 = MI.getOperand(3).getReg();
4308       // The two values are packed in one register.
4309       LLT PackedTy = LLT::fixed_vector(2, Ty);
4310       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4311       MI.getOperand(2).setReg(Concat.getReg(0));
4312       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4313     }
4314   }
4315 
4316   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4317 
4318   // Optimize _L to _LZ when the LOD is zero.
4319   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4320           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4321     const ConstantFP *ConstantLod;
4322 
4323     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4324                  m_GFCst(ConstantLod))) {
4325       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4326         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4327         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4328             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4329                                                       Intr->Dim);
4330 
4331         // The starting indexes should remain in the same place.
4332         --CorrectedNumVAddrs;
4333 
4334         MI.getOperand(MI.getNumExplicitDefs())
4335             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4336         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4337         Intr = NewImageDimIntr;
4338       }
4339     }
4340   }
4341 
4342   // Optimize _mip away when 'lod' is zero.
4343   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4344     int64_t ConstantLod;
4345     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4346                  m_ICst(ConstantLod))) {
4347       if (ConstantLod == 0) {
4348         // TODO: Change the intrinsic opcode and remove the operand instead of
4349         // replacing it with 0, as the _L to _LZ handling is done above.
4350         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4351         --CorrectedNumVAddrs;
4352       }
4353     }
4354   }
4355 
4356   // Rewrite the addressing register layout before doing anything else.
4357   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4358     // 16-bit gradients are supported, but are tied to the A16 control,
4359     // so both gradients and addresses must be 16 bit.
4360     return false;
4361   }
4362 
4363   if (IsA16 && !ST.hasA16()) {
4364     // A16 not supported
4365     return false;
4366   }
4367 
4368   if (IsA16 || IsG16) {
4369     if (Intr->NumVAddrs > 1) {
4370       SmallVector<Register, 4> PackedRegs;
4371 
4372       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4373                                 IsG16);
4374 
4375       // See also below in the non-a16 branch
4376       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4377 
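           // Without the NSA encoding, all address dwords must live in one
           // contiguous vector register, so concatenate them.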
4378       if (!UseNSA && PackedRegs.size() > 1) {
4379         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4380         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4381         PackedRegs[0] = Concat.getReg(0);
4382         PackedRegs.resize(1);
4383       }
4384 
4385       const unsigned NumPacked = PackedRegs.size();
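           // Rewrite the vaddr operands to use the packed registers, and clear
           // any trailing operands that are no longer needed.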
4386       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4387         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4388         if (!SrcOp.isReg()) {
4389           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4390           continue;
4391         }
4392 
4393         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4394 
4395         if (I - Intr->VAddrStart < NumPacked)
4396           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4397         else
4398           SrcOp.setReg(AMDGPU::NoRegister);
4399       }
4400     }
4401   } else {
4402     // If the register allocator cannot place the address registers contiguously
4403     // without introducing moves, then using the non-sequential address encoding
4404     // is always preferable, since it saves VALU instructions and is usually a
4405     // wash in terms of code size or even better.
4406     //
4407     // However, we currently have no way of hinting to the register allocator
4408     // that MIMG addresses should be placed contiguously when it is possible to
4409     // do so, so force non-NSA for the common 2-address case as a heuristic.
4410     //
4411     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4412     // allocation when possible.
4413     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4414 
4415     if (!UseNSA && Intr->NumVAddrs > 1)
4416       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4417                                Intr->NumVAddrs);
4418   }
4419 
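       // Record whether addresses (bit 0) and gradients (bit 1) are 16 bit in
       // an extra immediate operand.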
4420   int Flags = 0;
4421   if (IsA16)
4422     Flags |= 1;
4423   if (IsG16)
4424     Flags |= 2;
4425   MI.addOperand(MachineOperand::CreateImm(Flags));
4426 
4427   if (BaseOpcode->Store) { // No TFE for stores?
4428     // TODO: Handle dmask trim
4429     Register VData = MI.getOperand(1).getReg();
4430     LLT Ty = MRI->getType(VData);
4431     if (!Ty.isVector() || Ty.getElementType() != S16)
4432       return true;
4433 
4434     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4435     if (RepackedReg != VData) {
4436       MI.getOperand(1).setReg(RepackedReg);
4437     }
4438 
4439     return true;
4440   }
4441 
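       // The remaining cases return data; rewrite the result to match the
       // dword based register layout the instruction actually produces.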
4442   Register DstReg = MI.getOperand(0).getReg();
4443   LLT Ty = MRI->getType(DstReg);
4444   const LLT EltTy = Ty.getScalarType();
4445   const bool IsD16 = EltTy == S16;
4446   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4447 
4448   // Confirm that the return type is large enough for the dmask specified
4449   if (NumElts < DMaskLanes)
4450     return false;
4451 
4452   if (NumElts > 4 || DMaskLanes > 4)
4453     return false;
4454 
4455   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4456   const LLT AdjustedTy =
4457       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4458 
4459   // The raw dword aligned data component of the load. The only legal cases
4460   // where this matters should be when using the packed D16 format, for
4461   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4462   LLT RoundedTy;
4463 
4464   // S32 vector to cover all data, plus TFE result element.
4465   LLT TFETy;
4466 
4467   // Register type to use for each loaded component. Will be S32 or V2S16.
4468   LLT RegTy;
4469 
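       // With unpacked D16, each 16-bit component occupies the low half of a
       // 32-bit register; otherwise 16-bit components are packed in pairs.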
4470   if (IsD16 && ST.hasUnpackedD16VMem()) {
4471     RoundedTy =
4472         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4473     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4474     RegTy = S32;
4475   } else {
4476     unsigned EltSize = EltTy.getSizeInBits();
4477     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4478     unsigned RoundedSize = 32 * RoundedElts;
4479     RoundedTy = LLT::scalarOrVector(
4480         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4481     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4482     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4483   }
4484 
4485   // The return type does not need adjustment.
4486   // TODO: Should we change s16 case to s32 or <2 x s16>?
4487   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4488     return true;
4489 
4490   Register Dst1Reg;
4491 
4492   // Insert after the instruction.
4493   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4494 
4495   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4496   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4497   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4498   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4499 
4500   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4501 
4502   MI.getOperand(0).setReg(NewResultReg);
4503 
4504   // In the IR, TFE is supposed to be used with a 2 element struct return
4505   // type. The instruction really returns these two values in one contiguous
4506   // register, with one additional dword beyond the loaded data. Rewrite the
4507   // return type to use a single register result.
4508 
4509   if (IsTFE) {
4510     Dst1Reg = MI.getOperand(1).getReg();
4511     if (MRI->getType(Dst1Reg) != S32)
4512       return false;
4513 
4514     // TODO: Make sure the TFE operand bit is set.
4515     MI.RemoveOperand(1);
4516 
4517     // Handle the easy case that requires no repack instructions.
4518     if (Ty == S32) {
4519       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4520       return true;
4521     }
4522   }
4523 
4524   // Now figure out how to copy the new result register back into the old
4525   // result.
4526   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4527 
4528   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4529 
4530   if (ResultNumRegs == 1) {
4531     assert(!IsTFE);
4532     ResultRegs[0] = NewResultReg;
4533   } else {
4534     // We have to repack into a new vector of some kind.
4535     for (int I = 0; I != NumDataRegs; ++I)
4536       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4537     B.buildUnmerge(ResultRegs, NewResultReg);
4538 
4539     // Drop the final TFE element to get the data part. The TFE result is
4540     // directly written to the right place already.
4541     if (IsTFE)
4542       ResultRegs.resize(NumDataRegs);
4543   }
4544 
4545   // For an s16 scalar result, we form an s32 result with a truncate regardless
4546   // of packed vs. unpacked.
4547   if (IsD16 && !Ty.isVector()) {
4548     B.buildTrunc(DstReg, ResultRegs[0]);
4549     return true;
4550   }
4551 
4552   // Avoid a build/concat_vector of 1 entry.
4553   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4554     B.buildBitcast(DstReg, ResultRegs[0]);
4555     return true;
4556   }
4557 
4558   assert(Ty.isVector());
4559 
4560   if (IsD16) {
4561     // For packed D16 results with TFE enabled, all the data components are
4562     // S32. Cast back to the expected type.
4563     //
4564     // TODO: We don't really need to load s32 elements. We would only need one
4565     // cast for the TFE result if a multiple of v2s16 was used.
4566     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4567       for (Register &Reg : ResultRegs)
4568         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4569     } else if (ST.hasUnpackedD16VMem()) {
4570       for (Register &Reg : ResultRegs)
4571         Reg = B.buildTrunc(S16, Reg).getReg(0);
4572     }
4573   }
4574 
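       // Append undef registers so the final build/concat_vector covers the
       // full result type.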
4575   auto padWithUndef = [&](LLT Ty, int NumElts) {
4576     if (NumElts == 0)
4577       return;
4578     Register Undef = B.buildUndef(Ty).getReg(0);
4579     for (int I = 0; I != NumElts; ++I)
4580       ResultRegs.push_back(Undef);
4581   };
4582 
4583   // Pad out any elements eliminated due to the dmask.
4584   LLT ResTy = MRI->getType(ResultRegs[0]);
4585   if (!ResTy.isVector()) {
4586     padWithUndef(ResTy, NumElts - ResultRegs.size());
4587     B.buildBuildVector(DstReg, ResultRegs);
4588     return true;
4589   }
4590 
4591   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4592   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4593 
4594   // Deal with the one annoying legal case.
4595   const LLT V3S16 = LLT::fixed_vector(3, 16);
4596   if (Ty == V3S16) {
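         // Pad to <6 x s16>, concatenate, and then unmerge into two <3 x s16>
         // halves, keeping only the first.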
4597     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4598     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4599     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4600     return true;
4601   }
4602 
4603   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4604   B.buildConcatVectors(DstReg, ResultRegs);
4605   return true;
4606 }
4607 
4608 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4609   LegalizerHelper &Helper, MachineInstr &MI) const {
4610   MachineIRBuilder &B = Helper.MIRBuilder;
4611   GISelChangeObserver &Observer = Helper.Observer;
4612 
4613   Register Dst = MI.getOperand(0).getReg();
4614   LLT Ty = B.getMRI()->getType(Dst);
4615   unsigned Size = Ty.getSizeInBits();
4616   MachineFunction &MF = B.getMF();
4617 
4618   Observer.changingInstr(MI);
4619 
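       // Some result types need to be handled as an equivalently sized bitcast
       // type (see shouldBitcastLoadStoreType).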
4620   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4621     Ty = getBitcastRegisterType(Ty);
4622     Helper.bitcastDst(MI, Ty, 0);
4623     Dst = MI.getOperand(0).getReg();
4624     B.setInsertPt(B.getMBB(), MI);
4625   }
4626 
4627   // FIXME: We don't really need this intermediate instruction. The intrinsic
4628   // should be fixed to have a memory operand. Since it's readnone, we're not
4629   // allowed to add one.
4630   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4631   MI.RemoveOperand(1); // Remove intrinsic ID
4632 
4633   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4634   // TODO: Should this use datalayout alignment?
4635   const unsigned MemSize = (Size + 7) / 8;
4636   const Align MemAlign(4);
4637   MachineMemOperand *MMO = MF.getMachineMemOperand(
4638       MachinePointerInfo(),
4639       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4640           MachineMemOperand::MOInvariant,
4641       MemSize, MemAlign);
4642   MI.addMemOperand(MF, MMO);
4643 
4644   // There are no 96-bit result scalar loads, but widening to 128-bit should
4645   // always be legal. We may need to restore this to a 96-bit result if it turns
4646   // out this needs to be converted to a vector load during RegBankSelect.
4647   if (!isPowerOf2_32(Size)) {
4648     if (Ty.isVector())
4649       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4650     else
4651       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4652   }
4653 
4654   Observer.changedInstr(MI);
4655   return true;
4656 }
4657 
4658 // TODO: Move to selection
4659 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4660                                                 MachineRegisterInfo &MRI,
4661                                                 MachineIRBuilder &B) const {
4662   if (!ST.isTrapHandlerEnabled() ||
4663       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4664     return legalizeTrapEndpgm(MI, MRI, B);
4665 
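       // The lowering depends on the HSA ABI version: older ABIs pass the
       // queue pointer in SGPR0-1, while ABI V4 may omit it if the doorbell ID
       // is available.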
4666   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4667     switch (*HsaAbiVer) {
4668     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4669     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4670       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4671     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4672       return ST.supportsGetDoorbellID() ?
4673           legalizeTrapHsa(MI, MRI, B) :
4674           legalizeTrapHsaQueuePtr(MI, MRI, B);
4675     }
4676   }
4677 
4678   llvm_unreachable("Unknown trap handler");
4679 }
4680 
4681 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4682     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4683   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4684   MI.eraseFromParent();
4685   return true;
4686 }
4687 
4688 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4689     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4690   // Pass queue pointer to trap handler as input, and insert trap instruction
4691   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4692   Register LiveIn =
4693     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4694   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4695     return false;
4696 
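       // The trap handler expects the queue pointer in SGPR0-SGPR1.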
4697   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4698   B.buildCopy(SGPR01, LiveIn);
4699   B.buildInstr(AMDGPU::S_TRAP)
4700       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4701       .addReg(SGPR01, RegState::Implicit);
4702 
4703   MI.eraseFromParent();
4704   return true;
4705 }
4706 
4707 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4708     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4709   B.buildInstr(AMDGPU::S_TRAP)
4710       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4711   MI.eraseFromParent();
4712   return true;
4713 }
4714 
4715 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4716     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4717   // If this is a non-HSA path or the trap handler is disabled, report a
4718   // warning accordingly.
4719   if (!ST.isTrapHandlerEnabled() ||
4720       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4721     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4722                                      "debugtrap handler not supported",
4723                                      MI.getDebugLoc(), DS_Warning);
4724     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4725     Ctx.diagnose(NoTrap);
4726   } else {
4727     // Insert debug-trap instruction
4728     B.buildInstr(AMDGPU::S_TRAP)
4729         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4730   }
4731 
4732   MI.eraseFromParent();
4733   return true;
4734 }
4735 
4736 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4737                                                MachineIRBuilder &B) const {
4738   MachineRegisterInfo &MRI = *B.getMRI();
4739   const LLT S16 = LLT::scalar(16);
4740   const LLT S32 = LLT::scalar(32);
4741 
4742   Register DstReg = MI.getOperand(0).getReg();
4743   Register NodePtr = MI.getOperand(2).getReg();
4744   Register RayExtent = MI.getOperand(3).getReg();
4745   Register RayOrigin = MI.getOperand(4).getReg();
4746   Register RayDir = MI.getOperand(5).getReg();
4747   Register RayInvDir = MI.getOperand(6).getReg();
4748   Register TDescr = MI.getOperand(7).getReg();
4749 
4750   if (!ST.hasGFX10_AEncoding()) {
4751     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4752                                         "intrinsic not supported on subtarget",
4753                                         MI.getDebugLoc());
4754     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4755     return false;
4756   }
4757 
4758   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4759   bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4760   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4761                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4762                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4763                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4764 
4765   SmallVector<Register, 12> Ops;
4766   if (Is64) {
4767     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4768     Ops.push_back(Unmerge.getReg(0));
4769     Ops.push_back(Unmerge.getReg(1));
4770   } else {
4771     Ops.push_back(NodePtr);
4772   }
4773   Ops.push_back(RayExtent);
4774 
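       // Split a <4 x s32> source into scalars and append the x, y and z lanes
       // to the operand list.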
4775   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4776     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4777     Ops.push_back(Unmerge.getReg(0));
4778     Ops.push_back(Unmerge.getReg(1));
4779     Ops.push_back(Unmerge.getReg(2));
4780   };
4781 
4782   packLanes(RayOrigin);
4783   if (IsA16) {
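         // With 16-bit directions, repack the three RayDir and three RayInvDir
         // components pairwise into three dwords.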
4784     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4785     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4786     Register R1 = MRI.createGenericVirtualRegister(S32);
4787     Register R2 = MRI.createGenericVirtualRegister(S32);
4788     Register R3 = MRI.createGenericVirtualRegister(S32);
4789     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4790     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4791     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4792     Ops.push_back(R1);
4793     Ops.push_back(R2);
4794     Ops.push_back(R3);
4795   } else {
4796     packLanes(RayDir);
4797     packLanes(RayInvDir);
4798   }
4799 
4800   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4801     .addDef(DstReg)
4802     .addImm(Opcode);
4803 
4804   for (Register R : Ops) {
4805     MIB.addUse(R);
4806   }
4807 
4808   MIB.addUse(TDescr)
4809      .addImm(IsA16 ? 1 : 0)
4810      .cloneMemRefs(MI);
4811 
4812   MI.eraseFromParent();
4813   return true;
4814 }
4815 
4816 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4817                                             MachineInstr &MI) const {
4818   MachineIRBuilder &B = Helper.MIRBuilder;
4819   MachineRegisterInfo &MRI = *B.getMRI();
4820 
4821   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4822   auto IntrID = MI.getIntrinsicID();
4823   switch (IntrID) {
4824   case Intrinsic::amdgcn_if:
4825   case Intrinsic::amdgcn_else: {
4826     MachineInstr *Br = nullptr;
4827     MachineBasicBlock *UncondBrTarget = nullptr;
4828     bool Negated = false;
4829     if (MachineInstr *BrCond =
4830             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4831       const SIRegisterInfo *TRI
4832         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4833 
4834       Register Def = MI.getOperand(1).getReg();
4835       Register Use = MI.getOperand(3).getReg();
4836 
4837       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4838 
4839       if (Negated)
4840         std::swap(CondBrTarget, UncondBrTarget);
4841 
4842       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4843       if (IntrID == Intrinsic::amdgcn_if) {
4844         B.buildInstr(AMDGPU::SI_IF)
4845           .addDef(Def)
4846           .addUse(Use)
4847           .addMBB(UncondBrTarget);
4848       } else {
4849         B.buildInstr(AMDGPU::SI_ELSE)
4850             .addDef(Def)
4851             .addUse(Use)
4852             .addMBB(UncondBrTarget);
4853       }
4854 
4855       if (Br) {
4856         Br->getOperand(0).setMBB(CondBrTarget);
4857       } else {
4858         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4859         // since we're swapping branch targets it needs to be reinserted.
4860         // FIXME: IRTranslator should probably not do this
4861         B.buildBr(*CondBrTarget);
4862       }
4863 
4864       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4865       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4866       MI.eraseFromParent();
4867       BrCond->eraseFromParent();
4868       return true;
4869     }
4870 
4871     return false;
4872   }
4873   case Intrinsic::amdgcn_loop: {
4874     MachineInstr *Br = nullptr;
4875     MachineBasicBlock *UncondBrTarget = nullptr;
4876     bool Negated = false;
4877     if (MachineInstr *BrCond =
4878             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4879       const SIRegisterInfo *TRI
4880         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4881 
4882       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4883       Register Reg = MI.getOperand(2).getReg();
4884 
4885       if (Negated)
4886         std::swap(CondBrTarget, UncondBrTarget);
4887 
4888       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4889       B.buildInstr(AMDGPU::SI_LOOP)
4890         .addUse(Reg)
4891         .addMBB(UncondBrTarget);
4892 
4893       if (Br)
4894         Br->getOperand(0).setMBB(CondBrTarget);
4895       else
4896         B.buildBr(*CondBrTarget);
4897 
4898       MI.eraseFromParent();
4899       BrCond->eraseFromParent();
4900       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4901       return true;
4902     }
4903 
4904     return false;
4905   }
4906   case Intrinsic::amdgcn_kernarg_segment_ptr:
4907     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4908       // This only makes sense to call in a kernel, so just lower to null.
4909       B.buildConstant(MI.getOperand(0).getReg(), 0);
4910       MI.eraseFromParent();
4911       return true;
4912     }
4913 
4914     return legalizePreloadedArgIntrin(
4915       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4916   case Intrinsic::amdgcn_implicitarg_ptr:
4917     return legalizeImplicitArgPtr(MI, MRI, B);
4918   case Intrinsic::amdgcn_workitem_id_x:
4919     return legalizePreloadedArgIntrin(MI, MRI, B,
4920                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4921   case Intrinsic::amdgcn_workitem_id_y:
4922     return legalizePreloadedArgIntrin(MI, MRI, B,
4923                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4924   case Intrinsic::amdgcn_workitem_id_z:
4925     return legalizePreloadedArgIntrin(MI, MRI, B,
4926                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4927   case Intrinsic::amdgcn_workgroup_id_x:
4928     return legalizePreloadedArgIntrin(MI, MRI, B,
4929                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4930   case Intrinsic::amdgcn_workgroup_id_y:
4931     return legalizePreloadedArgIntrin(MI, MRI, B,
4932                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4933   case Intrinsic::amdgcn_workgroup_id_z:
4934     return legalizePreloadedArgIntrin(MI, MRI, B,
4935                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4936   case Intrinsic::amdgcn_dispatch_ptr:
4937     return legalizePreloadedArgIntrin(MI, MRI, B,
4938                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4939   case Intrinsic::amdgcn_queue_ptr:
4940     return legalizePreloadedArgIntrin(MI, MRI, B,
4941                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4942   case Intrinsic::amdgcn_implicit_buffer_ptr:
4943     return legalizePreloadedArgIntrin(
4944       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4945   case Intrinsic::amdgcn_dispatch_id:
4946     return legalizePreloadedArgIntrin(MI, MRI, B,
4947                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4948   case Intrinsic::amdgcn_fdiv_fast:
4949     return legalizeFDIVFastIntrin(MI, MRI, B);
4950   case Intrinsic::amdgcn_is_shared:
4951     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4952   case Intrinsic::amdgcn_is_private:
4953     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4954   case Intrinsic::amdgcn_wavefrontsize: {
4955     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4956     MI.eraseFromParent();
4957     return true;
4958   }
4959   case Intrinsic::amdgcn_s_buffer_load:
4960     return legalizeSBufferLoad(Helper, MI);
4961   case Intrinsic::amdgcn_raw_buffer_store:
4962   case Intrinsic::amdgcn_struct_buffer_store:
4963     return legalizeBufferStore(MI, MRI, B, false, false);
4964   case Intrinsic::amdgcn_raw_buffer_store_format:
4965   case Intrinsic::amdgcn_struct_buffer_store_format:
4966     return legalizeBufferStore(MI, MRI, B, false, true);
4967   case Intrinsic::amdgcn_raw_tbuffer_store:
4968   case Intrinsic::amdgcn_struct_tbuffer_store:
4969     return legalizeBufferStore(MI, MRI, B, true, true);
4970   case Intrinsic::amdgcn_raw_buffer_load:
4971   case Intrinsic::amdgcn_struct_buffer_load:
4972     return legalizeBufferLoad(MI, MRI, B, false, false);
4973   case Intrinsic::amdgcn_raw_buffer_load_format:
4974   case Intrinsic::amdgcn_struct_buffer_load_format:
4975     return legalizeBufferLoad(MI, MRI, B, true, false);
4976   case Intrinsic::amdgcn_raw_tbuffer_load:
4977   case Intrinsic::amdgcn_struct_tbuffer_load:
4978     return legalizeBufferLoad(MI, MRI, B, true, true);
4979   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4980   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4981   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4982   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4983   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4984   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4985   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4986   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4987   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4988   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4989   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4990   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4991   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4992   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4993   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4994   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4995   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4996   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4997   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4998   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4999   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5000   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5001   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5002   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5003   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5004   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5005   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5006   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5007   case Intrinsic::amdgcn_buffer_atomic_fadd:
5008   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5009   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5010   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5011   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5012     return legalizeBufferAtomic(MI, B, IntrID);
5013   case Intrinsic::amdgcn_atomic_inc:
5014     return legalizeAtomicIncDec(MI, B, true);
5015   case Intrinsic::amdgcn_atomic_dec:
5016     return legalizeAtomicIncDec(MI, B, false);
5017   case Intrinsic::trap:
5018     return legalizeTrapIntrinsic(MI, MRI, B);
5019   case Intrinsic::debugtrap:
5020     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5021   case Intrinsic::amdgcn_rsq_clamp:
5022     return legalizeRsqClampIntrinsic(MI, MRI, B);
5023   case Intrinsic::amdgcn_ds_fadd:
5024   case Intrinsic::amdgcn_ds_fmin:
5025   case Intrinsic::amdgcn_ds_fmax:
5026     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5027   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5028     return legalizeBVHIntrinsic(MI, B);
5029   default: {
5030     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5031             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5032       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5033     return true;
5034   }
5035   }
5036 
5037   return true;
5038 }
5039