1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of vector elements up to the next power of two.
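// For example, <3 x s32> widens to <4 x s32> and <5 x s16> widens to <8 x s16>.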
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the scalar size in bits up to the next power of two.
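// For example, s48 becomes s64 and s96 becomes s128.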
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd-sized vector which should be widened by
63 /// adding an additional element. This is mostly to handle <3 x s16> ->
64 /// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
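    // Split the total size into 64-bit pieces and spread the elements evenly
    // across them, so each resulting vector is roughly 64 bits or smaller.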
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements so that the total vector size is the
117 // next multiple of 32 bits.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
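    // NewNumElts is the smallest element count covering the size rounded up to
    // the next multiple of 32 bits.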
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
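  // Larger sizes are coerced to s32 or a vector of s32 elements,
  // e.g. <4 x s16> -> <2 x s32>.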
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
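  // Register-sized values are multiples of 32 bits, up to MaxRegisterSize
  // (1024 bits).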
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size,
212 // and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable
254     // for global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written
258     // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334 // work around this. Eventually it should ignore the type for loads and only
335 // care about the size. Return true in cases where we will work around this for
336 // now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd-sized memory
380 /// access up to the alignment. Note this is the case where the memory access
381 /// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             unsigned AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .clampScalar(0, S16, S32)
536       .clampMaxNumElements(0, S16, 2)
537       .scalarize(0)
538       .widenScalarToNextPow2(0, 32);
539 
540     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541       .legalFor({S32, S16, V2S16}) // Clamp modifier
542       .minScalarOrElt(0, S16)
543       .clampMaxNumElements(0, S16, 2)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 32)
546       .lower();
547   } else if (ST.has16BitInsts()) {
548     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549       .legalFor({S32, S16})
550       .clampScalar(0, S16, S32)
551       .scalarize(0)
552       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554     // Technically the saturating operations require clamp bit support, but this
555     // was introduced at the same time as 16-bit operations.
556     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557       .legalFor({S32, S16}) // Clamp modifier
558       .minScalar(0, S16)
559       .scalarize(0)
560       .widenScalarToNextPow2(0, 16)
561       .lower();
562 
563     // We're just lowering this, but it helps get a better result to try to
564     // coerce to the desired type first.
565     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566       .minScalar(0, S16)
567       .scalarize(0)
568       .lower();
569   } else {
570     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571       .legalFor({S32})
572       .clampScalar(0, S32, S32)
573       .scalarize(0);
574 
575     if (ST.hasIntClamp()) {
576       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577         .legalFor({S32}) // Clamp modifier.
578         .scalarize(0)
579         .minScalarOrElt(0, S32)
580         .lower();
581     } else {
582       // Clamp bit support was added in VI, along with 16-bit operations.
583       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584         .minScalar(0, S32)
585         .scalarize(0)
586         .lower();
587     }
588 
589     // FIXME: DAG expansion gets better results. The widening uses the smaller
590     // range values and goes for the min/max lowering directly.
591     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592       .minScalar(0, S32)
593       .scalarize(0)
594       .lower();
595   }
596 
597   getActionDefinitionsBuilder(
598       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599       .customFor({S32, S64})
600       .clampScalar(0, S32, S64)
601       .widenScalarToNextPow2(0, 32)
602       .scalarize(0);
603 
604   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605                    .legalFor({S32})
606                    .maxScalarOrElt(0, S32);
607 
608   if (ST.hasVOP3PInsts()) {
609     Mulh
610       .clampMaxNumElements(0, S8, 2)
611       .lowerFor({V2S8});
612   }
613 
614   Mulh
615     .scalarize(0)
616     .lower();
617 
618   // Report legal for any types we can handle anywhere. For the cases only legal
619   // on the SALU, RegBankSelect will be able to re-legalize.
620   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622     .clampScalar(0, S32, S64)
623     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625     .widenScalarToNextPow2(0)
626     .scalarize(0);
627 
628   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630     .legalFor({{S32, S1}, {S32, S32}})
631     .minScalar(0, S32)
632     // TODO: .scalarize(0)
633     .lower();
634 
635   getActionDefinitionsBuilder(G_BITCAST)
636     // Don't worry about the size constraint.
637     .legalIf(all(isRegisterType(0), isRegisterType(1)))
638     .lower();
639 
640 
641   getActionDefinitionsBuilder(G_CONSTANT)
642     .legalFor({S1, S32, S64, S16, GlobalPtr,
643                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644     .legalIf(isPointer(0))
645     .clampScalar(0, S32, S64)
646     .widenScalarToNextPow2(0);
647 
648   getActionDefinitionsBuilder(G_FCONSTANT)
649     .legalFor({S32, S64, S16})
650     .clampScalar(0, S16, S64);
651 
652   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653       .legalIf(isRegisterType(0))
654       // s1 and s16 are special cases because they have legal operations on
655       // them, but don't really occupy registers in the normal way.
656       .legalFor({S1, S16})
657       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658       .clampScalarOrElt(0, S32, MaxScalar)
659       .widenScalarToNextPow2(0, 32)
660       .clampMaxNumElements(0, S32, 16);
661 
662   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664   // If the amount is divergent, we have to do a wave reduction to get the
665   // maximum value, so this is expanded during RegBankSelect.
666   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667     .legalFor({{PrivatePtr, S32}});
668 
669   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670     .customIf(typeIsNot(0, PrivatePtr));
671 
672   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674   auto &FPOpActions = getActionDefinitionsBuilder(
675     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676     .legalFor({S32, S64});
677   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678     .customFor({S32, S64});
679   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680     .customFor({S32, S64});
681 
682   if (ST.has16BitInsts()) {
683     if (ST.hasVOP3PInsts())
684       FPOpActions.legalFor({S16, V2S16});
685     else
686       FPOpActions.legalFor({S16});
687 
688     TrigActions.customFor({S16});
689     FDIVActions.customFor({S16});
690   }
691 
692   auto &MinNumMaxNum = getActionDefinitionsBuilder({
693       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695   if (ST.hasVOP3PInsts()) {
696     MinNumMaxNum.customFor(FPTypesPK16)
697       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698       .clampMaxNumElements(0, S16, 2)
699       .clampScalar(0, S16, S64)
700       .scalarize(0);
701   } else if (ST.has16BitInsts()) {
702     MinNumMaxNum.customFor(FPTypes16)
703       .clampScalar(0, S16, S64)
704       .scalarize(0);
705   } else {
706     MinNumMaxNum.customFor(FPTypesBase)
707       .clampScalar(0, S32, S64)
708       .scalarize(0);
709   }
710 
711   if (ST.hasVOP3PInsts())
712     FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714   FPOpActions
715     .scalarize(0)
716     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718   TrigActions
719     .scalarize(0)
720     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722   FDIVActions
723     .scalarize(0)
724     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726   getActionDefinitionsBuilder({G_FNEG, G_FABS})
727     .legalFor(FPTypesPK16)
728     .clampMaxNumElements(0, S16, 2)
729     .scalarize(0)
730     .clampScalar(0, S16, S64);
731 
732   if (ST.has16BitInsts()) {
733     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734       .legalFor({S32, S64, S16})
735       .scalarize(0)
736       .clampScalar(0, S16, S64);
737   } else {
738     getActionDefinitionsBuilder(G_FSQRT)
739       .legalFor({S32, S64})
740       .scalarize(0)
741       .clampScalar(0, S32, S64);
742 
743     if (ST.hasFractBug()) {
744       getActionDefinitionsBuilder(G_FFLOOR)
745         .customFor({S64})
746         .legalFor({S32, S64})
747         .scalarize(0)
748         .clampScalar(0, S32, S64);
749     } else {
750       getActionDefinitionsBuilder(G_FFLOOR)
751         .legalFor({S32, S64})
752         .scalarize(0)
753         .clampScalar(0, S32, S64);
754     }
755   }
756 
757   getActionDefinitionsBuilder(G_FPTRUNC)
758     .legalFor({{S32, S64}, {S16, S32}})
759     .scalarize(0)
760     .lower();
761 
762   getActionDefinitionsBuilder(G_FPEXT)
763     .legalFor({{S64, S32}, {S32, S16}})
764     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765     .scalarize(0);
766 
767   getActionDefinitionsBuilder(G_FSUB)
768       // Use actual fsub instruction
769       .legalFor({S32})
770       // Must use fadd + fneg
771       .lowerFor({S64, S16, V2S16})
772       .scalarize(0)
773       .clampScalar(0, S32, S64);
774 
775   // Whether this is legal depends on the floating point mode for the function.
776   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778     FMad.customFor({S32, S16});
779   else if (ST.hasMadMacF32Insts())
780     FMad.customFor({S32});
781   else if (ST.hasMadF16())
782     FMad.customFor({S16});
783   FMad.scalarize(0)
784       .lower();
785 
786   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787   if (ST.has16BitInsts()) {
788     FRem.customFor({S16, S32, S64});
789   } else {
790     FRem.minScalar(0, S32)
791         .customFor({S32, S64});
792   }
793   FRem.scalarize(0);
794 
795   // TODO: Do we need to clamp maximum bitwidth?
796   getActionDefinitionsBuilder(G_TRUNC)
797     .legalIf(isScalar(0))
798     .legalFor({{V2S16, V2S32}})
799     .clampMaxNumElements(0, S16, 2)
800     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801     // situations (like an invalid implicit use), we don't want to infinite loop
802     // in the legalizer.
803     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804     .alwaysLegal();
805 
806   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808                {S32, S1}, {S64, S1}, {S16, S1}})
809     .scalarize(0)
810     .clampScalar(0, S32, S64)
811     .widenScalarToNextPow2(1, 32);
812 
813   // TODO: Split s1->s64 during regbankselect for VALU.
814   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816     .lowerFor({{S32, S64}})
817     .lowerIf(typeIs(1, S1))
818     .customFor({{S64, S64}});
819   if (ST.has16BitInsts())
820     IToFP.legalFor({{S16, S16}});
821   IToFP.clampScalar(1, S32, S64)
822        .minScalar(0, S32)
823        .scalarize(0)
824        .widenScalarToNextPow2(1);
825 
826   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
827     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
828     .customFor({{S64, S32}, {S64, S64}})
829     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
830   if (ST.has16BitInsts())
831     FPToI.legalFor({{S16, S16}});
832   else
833     FPToI.minScalar(1, S32);
834 
835   FPToI.minScalar(0, S32)
836        .widenScalarToNextPow2(0, 32)
837        .scalarize(0)
838        .lower();
839 
840   // Lower roundeven into G_FRINT
841   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
842     .scalarize(0)
843     .lower();
844 
845   if (ST.has16BitInsts()) {
846     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847       .legalFor({S16, S32, S64})
848       .clampScalar(0, S16, S64)
849       .scalarize(0);
850   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
851     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852       .legalFor({S32, S64})
853       .clampScalar(0, S32, S64)
854       .scalarize(0);
855   } else {
856     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
857       .legalFor({S32})
858       .customFor({S64})
859       .clampScalar(0, S32, S64)
860       .scalarize(0);
861   }
862 
863   getActionDefinitionsBuilder(G_PTR_ADD)
864     .legalIf(all(isPointer(0), sameSize(0, 1)))
865     .scalarize(0)
866     .scalarSameSizeAs(1, 0);
867 
868   getActionDefinitionsBuilder(G_PTRMASK)
869     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
870     .scalarSameSizeAs(1, 0)
871     .scalarize(0);
872 
873   auto &CmpBuilder =
874     getActionDefinitionsBuilder(G_ICMP)
875     // The compare output type differs based on the register bank of the output,
876     // so make both s1 and s32 legal.
877     //
878     // Scalar compares producing output in scc will be promoted to s32, as that
879     // is the allocatable register type that will be needed for the copy from
880     // scc. This will be promoted during RegBankSelect, and we assume something
881     // before that won't try to use s32 result types.
882     //
883     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
884     // bank.
885     .legalForCartesianProduct(
886       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
887     .legalForCartesianProduct(
888       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
889   if (ST.has16BitInsts()) {
890     CmpBuilder.legalFor({{S1, S16}});
891   }
892 
893   CmpBuilder
894     .widenScalarToNextPow2(1)
895     .clampScalar(1, S32, S64)
896     .scalarize(0)
897     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
898 
899   getActionDefinitionsBuilder(G_FCMP)
900     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
901     .widenScalarToNextPow2(1)
902     .clampScalar(1, S32, S64)
903     .scalarize(0);
904 
905   // FIXME: fpow has a selection pattern that should move to custom lowering.
906   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
907   if (ST.has16BitInsts())
908     Exp2Ops.legalFor({S32, S16});
909   else
910     Exp2Ops.legalFor({S32});
911   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
912   Exp2Ops.scalarize(0);
913 
914   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
915   if (ST.has16BitInsts())
916     ExpOps.customFor({{S32}, {S16}});
917   else
918     ExpOps.customFor({S32});
919   ExpOps.clampScalar(0, MinScalarFPTy, S32)
920         .scalarize(0);
921 
922   getActionDefinitionsBuilder(G_FPOWI)
923     .clampScalar(0, MinScalarFPTy, S32)
924     .lower();
925 
926   // The 64-bit versions produce 32-bit results, but only on the SALU.
927   getActionDefinitionsBuilder(G_CTPOP)
928     .legalFor({{S32, S32}, {S32, S64}})
929     .clampScalar(0, S32, S32)
930     .clampScalar(1, S32, S64)
931     .scalarize(0)
932     .widenScalarToNextPow2(0, 32)
933     .widenScalarToNextPow2(1, 32);
934 
935   // The hardware instructions return a different result on 0 than the generic
936   // instructions expect. The hardware produces -1, but these produce the
937   // bitwidth.
938   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
939     .scalarize(0)
940     .clampScalar(0, S32, S32)
941     .clampScalar(1, S32, S64)
942     .widenScalarToNextPow2(0, 32)
943     .widenScalarToNextPow2(1, 32)
944     .lower();
945 
946   // The 64-bit versions produce 32-bit results, but only on the SALU.
947   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
948     .legalFor({{S32, S32}, {S32, S64}})
949     .clampScalar(0, S32, S32)
950     .clampScalar(1, S32, S64)
951     .scalarize(0)
952     .widenScalarToNextPow2(0, 32)
953     .widenScalarToNextPow2(1, 32);
954 
955   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
956   // RegBankSelect.
957   getActionDefinitionsBuilder(G_BITREVERSE)
958     .legalFor({S32, S64})
959     .clampScalar(0, S32, S64)
960     .scalarize(0)
961     .widenScalarToNextPow2(0);
962 
963   if (ST.has16BitInsts()) {
964     getActionDefinitionsBuilder(G_BSWAP)
965       .legalFor({S16, S32, V2S16})
966       .clampMaxNumElements(0, S16, 2)
967       // FIXME: Fixing non-power-of-2 before clamp is workaround for
968       // narrowScalar limitation.
969       .widenScalarToNextPow2(0)
970       .clampScalar(0, S16, S32)
971       .scalarize(0);
972 
973     if (ST.hasVOP3PInsts()) {
974       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
975         .legalFor({S32, S16, V2S16})
976         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
977         .clampMaxNumElements(0, S16, 2)
978         .minScalar(0, S16)
979         .widenScalarToNextPow2(0)
980         .scalarize(0)
981         .lower();
982     } else {
983       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
984         .legalFor({S32, S16})
985         .widenScalarToNextPow2(0)
986         .minScalar(0, S16)
987         .scalarize(0)
988         .lower();
989     }
990   } else {
991     // TODO: Should have same legality without v_perm_b32
992     getActionDefinitionsBuilder(G_BSWAP)
993       .legalFor({S32})
994       .lowerIf(scalarNarrowerThan(0, 32))
995       // FIXME: Fixing non-power-of-2 before clamp is workaround for
996       // narrowScalar limitation.
997       .widenScalarToNextPow2(0)
998       .maxScalar(0, S32)
999       .scalarize(0)
1000       .lower();
1001 
1002     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1003       .legalFor({S32})
1004       .minScalar(0, S32)
1005       .widenScalarToNextPow2(0)
1006       .scalarize(0)
1007       .lower();
1008   }
1009 
1010   getActionDefinitionsBuilder(G_INTTOPTR)
1011     // List the common cases
1012     .legalForCartesianProduct(AddrSpaces64, {S64})
1013     .legalForCartesianProduct(AddrSpaces32, {S32})
1014     .scalarize(0)
1015     // Accept any address space as long as the size matches
1016     .legalIf(sameSize(0, 1))
1017     .widenScalarIf(smallerThan(1, 0),
1018       [](const LegalityQuery &Query) {
1019         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1020       })
1021     .narrowScalarIf(largerThan(1, 0),
1022       [](const LegalityQuery &Query) {
1023         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1024       });
1025 
1026   getActionDefinitionsBuilder(G_PTRTOINT)
1027     // List the common cases
1028     .legalForCartesianProduct(AddrSpaces64, {S64})
1029     .legalForCartesianProduct(AddrSpaces32, {S32})
1030     .scalarize(0)
1031     // Accept any address space as long as the size matches
1032     .legalIf(sameSize(0, 1))
1033     .widenScalarIf(smallerThan(0, 1),
1034       [](const LegalityQuery &Query) {
1035         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036       })
1037     .narrowScalarIf(
1038       largerThan(0, 1),
1039       [](const LegalityQuery &Query) {
1040         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1041       });
1042 
1043   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1044     .scalarize(0)
1045     .custom();
1046 
1047   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1048                                     bool IsLoad) -> bool {
1049     const LLT DstTy = Query.Types[0];
1050 
1051     // Split vector extloads.
1052     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1053     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1054 
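    // For extending loads, treat the access as if it were as wide as its
    // alignment when deciding whether a split is needed.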
1055     if (MemSize < DstTy.getSizeInBits())
1056       MemSize = std::max(MemSize, AlignBits);
1057 
1058     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1059       return true;
1060 
1061     const LLT PtrTy = Query.Types[1];
1062     unsigned AS = PtrTy.getAddressSpace();
1063     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1064       return true;
1065 
1066     // Catch weird sized loads that don't evenly divide into the access sizes
1067     // TODO: May be able to widen depending on alignment etc.
1068     unsigned NumRegs = (MemSize + 31) / 32;
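    // NumRegs is the number of 32-bit registers needed to hold the access.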
1069     if (NumRegs == 3) {
1070       if (!ST.hasDwordx3LoadStores())
1071         return true;
1072     } else {
1073       // If the alignment allows, these should have been widened.
1074       if (!isPowerOf2_32(NumRegs))
1075         return true;
1076     }
1077 
1078     if (AlignBits < MemSize) {
1079       const SITargetLowering *TLI = ST.getTargetLowering();
1080       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1081                                                       Align(AlignBits / 8));
1082     }
1083 
1084     return false;
1085   };
1086 
1087   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1088   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1089   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
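  // Note: an alignment of 0 in the memory descriptors below means no minimum
  // alignment requirement for that entry.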
1090 
1091   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1092   // LDS
1093   // TODO: Unsupported flat for SI.
1094 
1095   for (unsigned Op : {G_LOAD, G_STORE}) {
1096     const bool IsStore = Op == G_STORE;
1097 
1098     auto &Actions = getActionDefinitionsBuilder(Op);
1099     // Explicitly list some common cases.
1100     // TODO: Does this help compile time at all?
1101     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1102                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1103                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1104                                       {S64, GlobalPtr, S64, GlobalAlign32},
1105                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1106                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1107                                       {S32, GlobalPtr, S8, GlobalAlign8},
1108                                       {S32, GlobalPtr, S16, GlobalAlign16},
1109 
1110                                       {S32, LocalPtr, S32, 32},
1111                                       {S64, LocalPtr, S64, 32},
1112                                       {V2S32, LocalPtr, V2S32, 32},
1113                                       {S32, LocalPtr, S8, 8},
1114                                       {S32, LocalPtr, S16, 16},
1115                                       {V2S16, LocalPtr, S32, 32},
1116 
1117                                       {S32, PrivatePtr, S32, 32},
1118                                       {S32, PrivatePtr, S8, 8},
1119                                       {S32, PrivatePtr, S16, 16},
1120                                       {V2S16, PrivatePtr, S32, 32},
1121 
1122                                       {S32, ConstantPtr, S32, GlobalAlign32},
1123                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1124                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1125                                       {S64, ConstantPtr, S64, GlobalAlign32},
1126                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1127     Actions.legalIf(
1128       [=](const LegalityQuery &Query) -> bool {
1129         return isLoadStoreLegal(ST, Query);
1130       });
1131 
1132     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1133     // 64-bits.
1134     //
1135     // TODO: Should generalize bitcast action into coerce, which will also cover
1136     // inserting addrspacecasts.
1137     Actions.customIf(typeIs(1, Constant32Ptr));
1138 
1139     // Turn any illegal element vectors into something easier to deal
1140     // with. These will ultimately produce 32-bit scalar shifts to extract the
1141     // parts anyway.
1142     //
1143     // For odd 16-bit element vectors, prefer to split those into pieces with
1144     // 16-bit vector parts.
1145     Actions.bitcastIf(
1146       [=](const LegalityQuery &Query) -> bool {
1147         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1148                                           Query.MMODescrs[0].MemoryTy);
1149       }, bitcastToRegisterType(0));
1150 
1151     if (!IsStore) {
1152       // Widen suitably aligned loads by loading extra bytes. The standard
1153       // legalization actions can't properly express widening memory operands.
1154       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1155         return shouldWidenLoad(ST, Query, G_LOAD);
1156       });
1157     }
1158 
1159     // FIXME: load/store narrowing should be moved to lower action
1160     Actions
1161         .narrowScalarIf(
1162             [=](const LegalityQuery &Query) -> bool {
1163               return !Query.Types[0].isVector() &&
1164                      needToSplitMemOp(Query, Op == G_LOAD);
1165             },
1166             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1167               const LLT DstTy = Query.Types[0];
1168               const LLT PtrTy = Query.Types[1];
1169 
1170               const unsigned DstSize = DstTy.getSizeInBits();
1171               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1172 
1173               // Split extloads.
1174               if (DstSize > MemSize)
1175                 return std::make_pair(0, LLT::scalar(MemSize));
1176 
1177               if (!isPowerOf2_32(DstSize)) {
1178                 // We're probably decomposing an odd sized store. Try to split
1179                 // to the widest type. TODO: Account for alignment. As-is it
1180                 // should be OK, since the new parts will be further legalized.
1181                 unsigned FloorSize = PowerOf2Floor(DstSize);
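                // e.g. an s96 value is first narrowed using s64.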
1182                 return std::make_pair(0, LLT::scalar(FloorSize));
1183               }
1184 
1185               if (DstSize > 32 && (DstSize % 32 != 0)) {
1186                 // FIXME: Need a way to specify non-extload of larger size if
1187                 // suitably aligned.
1188                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1189               }
1190 
1191               unsigned MaxSize = maxSizeForAddrSpace(ST,
1192                                                      PtrTy.getAddressSpace(),
1193                                                      Op == G_LOAD);
1194               if (MemSize > MaxSize)
1195                 return std::make_pair(0, LLT::scalar(MaxSize));
1196 
1197               unsigned Align = Query.MMODescrs[0].AlignInBits;
1198               return std::make_pair(0, LLT::scalar(Align));
1199             })
1200         .fewerElementsIf(
1201             [=](const LegalityQuery &Query) -> bool {
1202               return Query.Types[0].isVector() &&
1203                      needToSplitMemOp(Query, Op == G_LOAD);
1204             },
1205             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1206               const LLT DstTy = Query.Types[0];
1207               const LLT PtrTy = Query.Types[1];
1208 
1209               LLT EltTy = DstTy.getElementType();
1210               unsigned MaxSize = maxSizeForAddrSpace(ST,
1211                                                      PtrTy.getAddressSpace(),
1212                                                      Op == G_LOAD);
1213 
1214               // FIXME: Handle widened to power of 2 results better. This ends
1215               // up scalarizing.
1216               // FIXME: 3 element stores scalarized on SI
1217 
1218               // Split if it's too large for the address space.
1219               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1220               if (MemSize > MaxSize) {
1221                 unsigned NumElts = DstTy.getNumElements();
1222                 unsigned EltSize = EltTy.getSizeInBits();
1223 
1224                 if (MaxSize % EltSize == 0) {
1225                   return std::make_pair(
1226                       0, LLT::scalarOrVector(
1227                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1228                 }
1229 
1230                 unsigned NumPieces = MemSize / MaxSize;
1231 
1232                 // FIXME: Refine when odd breakdowns handled
1233                 // The scalars will need to be re-legalized.
1234                 if (NumPieces == 1 || NumPieces >= NumElts ||
1235                     NumElts % NumPieces != 0)
1236                   return std::make_pair(0, EltTy);
1237 
1238                 return std::make_pair(
1239                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1240               }
1241 
1242               // FIXME: We could probably handle weird extending loads better.
1243               if (DstTy.getSizeInBits() > MemSize)
1244                 return std::make_pair(0, EltTy);
1245 
1246               unsigned EltSize = EltTy.getSizeInBits();
1247               unsigned DstSize = DstTy.getSizeInBits();
1248               if (!isPowerOf2_32(DstSize)) {
1249                 // We're probably decomposing an odd sized store. Try to split
1250                 // to the widest type. TODO: Account for alignment. As-is it
1251                 // should be OK, since the new parts will be further legalized.
1252                 unsigned FloorSize = PowerOf2Floor(DstSize);
1253                 return std::make_pair(
1254                     0, LLT::scalarOrVector(
1255                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1256               }
1257 
1258               // Need to split because of alignment.
1259               unsigned Align = Query.MMODescrs[0].AlignInBits;
1260               if (EltSize > Align &&
1261                   (EltSize / Align < DstTy.getNumElements())) {
1262                 return std::make_pair(
1263                     0, LLT::fixed_vector(EltSize / Align, EltTy));
1264               }
1265 
1266               // May need relegalization for the scalars.
1267               return std::make_pair(0, EltTy);
1268             })
1269     .lowerIfMemSizeNotPow2()
1270     .minScalar(0, S32)
1271     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1272     .widenScalarToNextPow2(0)
1273     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1274     .lower();
1275   }
1276 
1277   // FIXME: Unaligned accesses not lowered.
1278   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1279                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1280                                                   {S32, GlobalPtr, S16, 2 * 8},
1281                                                   {S32, LocalPtr, S8, 8},
1282                                                   {S32, LocalPtr, S16, 16},
1283                                                   {S32, PrivatePtr, S8, 8},
1284                                                   {S32, PrivatePtr, S16, 16},
1285                                                   {S32, ConstantPtr, S8, 8},
1286                                                   {S32, ConstantPtr, S16, 2 * 8}})
1287                        .legalIf(
1288                          [=](const LegalityQuery &Query) -> bool {
1289                            return isLoadStoreLegal(ST, Query);
1290                          });
1291 
1292   if (ST.hasFlatAddressSpace()) {
1293     ExtLoads.legalForTypesWithMemDesc(
1294         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1295   }
1296 
1297   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1298   // 64-bits.
1299   //
1300   // TODO: Should generalize bitcast action into coerce, which will also cover
1301   // inserting addrspacecasts.
1302   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1303 
1304   ExtLoads.clampScalar(0, S32, S32)
1305           .widenScalarToNextPow2(0)
1306           .lower();
1307 
1308   auto &Atomics = getActionDefinitionsBuilder(
1309     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1310      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1311      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1312      G_ATOMICRMW_UMIN})
1313     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1314                {S64, GlobalPtr}, {S64, LocalPtr},
1315                {S32, RegionPtr}, {S64, RegionPtr}});
1316   if (ST.hasFlatAddressSpace()) {
1317     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1318   }
1319 
1320   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1321   if (ST.hasLDSFPAtomics()) {
1322     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1323     if (ST.hasGFX90AInsts())
1324       Atomic.legalFor({{S64, LocalPtr}});
1325   }
1326   if (ST.hasAtomicFaddInsts())
1327     Atomic.legalFor({{S32, GlobalPtr}});
1328 
1329   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1330   // demarshalling
1331   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1332     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1333                 {S32, FlatPtr}, {S64, FlatPtr}})
1334     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1335                {S32, RegionPtr}, {S64, RegionPtr}});
1336   // TODO: Pointer types, any 32-bit or 64-bit vector
1337 
1338   // Condition should be s32 for scalar, s1 for vector.
1339   getActionDefinitionsBuilder(G_SELECT)
1340       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1341                                  LocalPtr, FlatPtr, PrivatePtr,
1342                                  LLT::fixed_vector(2, LocalPtr),
1343                                  LLT::fixed_vector(2, PrivatePtr)},
1344                                 {S1, S32})
1345       .clampScalar(0, S16, S64)
1346       .scalarize(1)
1347       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1348       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1349       .clampMaxNumElements(0, S32, 2)
1350       .clampMaxNumElements(0, LocalPtr, 2)
1351       .clampMaxNumElements(0, PrivatePtr, 2)
1352       .scalarize(0)
1353       .widenScalarToNextPow2(0)
1354       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1355 
1356   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1357   // be more flexible with the shift amount type.
1358   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1359     .legalFor({{S32, S32}, {S64, S32}});
1360   if (ST.has16BitInsts()) {
1361     if (ST.hasVOP3PInsts()) {
1362       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1363             .clampMaxNumElements(0, S16, 2);
1364     } else
1365       Shifts.legalFor({{S16, S16}});
1366 
1367     // TODO: Support 16-bit shift amounts for all types
1368     Shifts.widenScalarIf(
1369       [=](const LegalityQuery &Query) {
1370         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1371         // 32-bit amount.
1372         const LLT ValTy = Query.Types[0];
1373         const LLT AmountTy = Query.Types[1];
1374         return ValTy.getSizeInBits() <= 16 &&
1375                AmountTy.getSizeInBits() < 16;
1376       }, changeTo(1, S16));
1377     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1378     Shifts.clampScalar(1, S32, S32);
1379     Shifts.clampScalar(0, S16, S64);
1380     Shifts.widenScalarToNextPow2(0, 16);
1381 
1382     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1383       .minScalar(0, S16)
1384       .scalarize(0)
1385       .lower();
1386   } else {
1387     // Make sure we legalize the shift amount type first, as the general
1388     // expansion for the shifted type will produce much worse code if it hasn't
1389     // been truncated already.
1390     Shifts.clampScalar(1, S32, S32);
1391     Shifts.clampScalar(0, S32, S64);
1392     Shifts.widenScalarToNextPow2(0, 32);
1393 
1394     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1395       .minScalar(0, S32)
1396       .scalarize(0)
1397       .lower();
1398   }
1399   Shifts.scalarize(0);
1400 
1401   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1402     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1403     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1404     unsigned IdxTypeIdx = 2;
1405 
1406     getActionDefinitionsBuilder(Op)
1407       .customIf([=](const LegalityQuery &Query) {
1408           const LLT EltTy = Query.Types[EltTypeIdx];
1409           const LLT VecTy = Query.Types[VecTypeIdx];
1410           const LLT IdxTy = Query.Types[IdxTypeIdx];
1411           const unsigned EltSize = EltTy.getSizeInBits();
1412           return (EltSize == 32 || EltSize == 64) &&
1413                   VecTy.getSizeInBits() % 32 == 0 &&
1414                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1415                   IdxTy.getSizeInBits() == 32;
1416         })
1417       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1418                  bitcastToVectorElement32(VecTypeIdx))
1419       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1420       .bitcastIf(
1421         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1422         [=](const LegalityQuery &Query) {
1423           // For > 64-bit element types, try to turn this into a 64-bit
1424           // element vector since we may be able to do better indexing
1425           // if this is scalar. If not, fall back to 32.
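          // (For example, a 256-bit <2 x s128> becomes <4 x s64>, while a
          // 192-bit <2 x s96> falls back to <6 x s32> since 96 is not a
          // multiple of 64.)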
1426           const LLT EltTy = Query.Types[EltTypeIdx];
1427           const LLT VecTy = Query.Types[VecTypeIdx];
1428           const unsigned DstEltSize = EltTy.getSizeInBits();
1429           const unsigned VecSize = VecTy.getSizeInBits();
1430 
1431           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1432           return std::make_pair(
1433               VecTypeIdx,
1434               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1435         })
1436       .clampScalar(EltTypeIdx, S32, S64)
1437       .clampScalar(VecTypeIdx, S32, S64)
1438       .clampScalar(IdxTypeIdx, S32, S32)
1439       .clampMaxNumElements(VecTypeIdx, S32, 32)
1440       // TODO: Clamp elements for 64-bit vectors?
1441       // It should only be necessary with variable indexes.
1442       // As a last resort, lower to the stack
1443       .lower();
1444   }
1445 
1446   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1447     .unsupportedIf([=](const LegalityQuery &Query) {
1448         const LLT &EltTy = Query.Types[1].getElementType();
1449         return Query.Types[0] != EltTy;
1450       });
1451 
1452   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1453     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1454     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1455 
1456     // FIXME: Doesn't handle extract of illegal sizes.
1457     getActionDefinitionsBuilder(Op)
1458       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1459       // FIXME: Multiples of 16 should not be legal.
1460       .legalIf([=](const LegalityQuery &Query) {
1461           const LLT BigTy = Query.Types[BigTyIdx];
1462           const LLT LitTy = Query.Types[LitTyIdx];
1463           return (BigTy.getSizeInBits() % 32 == 0) &&
1464                  (LitTy.getSizeInBits() % 16 == 0);
1465         })
1466       .widenScalarIf(
1467         [=](const LegalityQuery &Query) {
1468           const LLT BigTy = Query.Types[BigTyIdx];
1469           return (BigTy.getScalarSizeInBits() < 16);
1470         },
1471         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1472       .widenScalarIf(
1473         [=](const LegalityQuery &Query) {
1474           const LLT LitTy = Query.Types[LitTyIdx];
1475           return (LitTy.getScalarSizeInBits() < 16);
1476         },
1477         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1478       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1479       .widenScalarToNextPow2(BigTyIdx, 32);
1480 
1481   }
1482 
1483   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1484     .legalForCartesianProduct(AllS32Vectors, {S32})
1485     .legalForCartesianProduct(AllS64Vectors, {S64})
1486     .clampNumElements(0, V16S32, V32S32)
1487     .clampNumElements(0, V2S64, V16S64)
1488     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1489 
1490   if (ST.hasScalarPackInsts()) {
1491     BuildVector
1492       // FIXME: Should probably widen s1 vectors straight to s32
1493       .minScalarOrElt(0, S16)
1494       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1495       .minScalar(1, S32);
1496 
1497     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1498       .legalFor({V2S16, S32})
1499       .lower();
1500     BuildVector.minScalarOrElt(0, S32);
1501   } else {
1502     BuildVector.customFor({V2S16, S16});
1503     BuildVector.minScalarOrElt(0, S32);
1504 
1505     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1506       .customFor({V2S16, S32})
1507       .lower();
1508   }
1509 
1510   BuildVector.legalIf(isRegisterType(0));
1511 
1512   // FIXME: Clamp maximum size
1513   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1514     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1515     .clampMaxNumElements(0, S32, 32)
1516     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1517     .clampMaxNumElements(0, S16, 64);
1518 
1519   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1520   // pre-legalize.
1521   if (ST.hasVOP3PInsts()) {
1522     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1523       .customFor({V2S16, V2S16})
1524       .lower();
1525   } else
1526     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1527 
1528   // Merge/Unmerge
1529   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1530     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1531     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1532 
1533     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1534       const LLT Ty = Query.Types[TypeIdx];
1535       if (Ty.isVector()) {
1536         const LLT &EltTy = Ty.getElementType();
1537         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1538           return true;
1539         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1540           return true;
1541       }
1542       return false;
1543     };
1544 
1545     auto &Builder = getActionDefinitionsBuilder(Op)
1546       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1547       .lowerFor({{S16, V2S16}})
1548       .lowerIf([=](const LegalityQuery &Query) {
1549           const LLT BigTy = Query.Types[BigTyIdx];
1550           return BigTy.getSizeInBits() == 32;
1551         })
1552       // Try to widen to s16 first for small types.
1553       // TODO: Only do this on targets with legal s16 shifts
1554       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1555       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1556       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1557       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1558                            elementTypeIs(1, S16)),
1559                        changeTo(1, V2S16))
1560       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1561       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1562       // valid.
1563       .clampScalar(LitTyIdx, S32, S512)
1564       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1565       // Break up vectors with weird elements into scalars
1566       .fewerElementsIf(
1567         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1568         scalarize(0))
1569       .fewerElementsIf(
1570         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1571         scalarize(1))
1572       .clampScalar(BigTyIdx, S32, MaxScalar);
1573 
1574     if (Op == G_MERGE_VALUES) {
1575       Builder.widenScalarIf(
1576         // TODO: Use 16-bit shifts if legal for 8-bit values?
1577         [=](const LegalityQuery &Query) {
1578           const LLT Ty = Query.Types[LitTyIdx];
1579           return Ty.getSizeInBits() < 32;
1580         },
1581         changeTo(LitTyIdx, S32));
1582     }
1583 
1584     Builder.widenScalarIf(
1585       [=](const LegalityQuery &Query) {
1586         const LLT Ty = Query.Types[BigTyIdx];
1587         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1588           Ty.getSizeInBits() % 16 != 0;
1589       },
1590       [=](const LegalityQuery &Query) {
1591         // Pick the next power of 2, or a multiple of 64 over 128,
1592         // whichever is smaller.
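        // (For example, a 264-bit scalar widens to 320 bits, the next multiple
        // of 64, rather than to 512.)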
1593         const LLT &Ty = Query.Types[BigTyIdx];
1594         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1595         if (NewSizeInBits >= 256) {
1596           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1597           if (RoundedTo < NewSizeInBits)
1598             NewSizeInBits = RoundedTo;
1599         }
1600         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1601       })
1602       // Any vectors left are the wrong size. Scalarize them.
1603       .scalarize(0)
1604       .scalarize(1);
1605   }
1606 
1607   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1608   // RegBankSelect.
1609   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1610     .legalFor({{S32}, {S64}});
1611 
1612   if (ST.hasVOP3PInsts()) {
1613     SextInReg.lowerFor({{V2S16}})
1614       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1615       // get more vector shift opportunities, since we'll get those when
1616       // expanded.
1617       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1618   } else if (ST.has16BitInsts()) {
1619     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1620   } else {
1621     // Prefer to promote to s32 before lowering if we don't have 16-bit
1622     // shifts. This avoids a lot of intermediate truncate and extend operations.
1623     SextInReg.lowerFor({{S32}, {S64}});
1624   }
1625 
1626   SextInReg
1627     .scalarize(0)
1628     .clampScalar(0, S32, S64)
1629     .lower();
1630 
1631   // TODO: Only try to form v2s16 with legal packed instructions.
1632   getActionDefinitionsBuilder(G_FSHR)
1633     .legalFor({{S32, S32}})
1634     .lowerFor({{V2S16, V2S16}})
1635     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1636     .scalarize(0)
1637     .lower();
1638 
1639   if (ST.hasVOP3PInsts()) {
1640     getActionDefinitionsBuilder(G_FSHL)
1641       .lowerFor({{V2S16, V2S16}})
1642       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1643       .scalarize(0)
1644       .lower();
1645   } else {
1646     getActionDefinitionsBuilder(G_FSHL)
1647       .scalarize(0)
1648       .lower();
1649   }
1650 
1651   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652     .legalFor({S64});
1653 
1654   getActionDefinitionsBuilder(G_FENCE)
1655     .alwaysLegal();
1656 
1657   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658       .scalarize(0)
1659       .minScalar(0, S32)
1660       .lower();
1661 
1662   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663       .legalFor({{S32, S32}, {S64, S32}})
1664       .clampScalar(1, S32, S32)
1665       .clampScalar(0, S32, S64)
1666       .widenScalarToNextPow2(0)
1667       .scalarize(0);
1668 
1669   getActionDefinitionsBuilder({
1670       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1671       G_FCOPYSIGN,
1672 
1673       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674       G_ATOMICRMW_NAND,
1675       G_ATOMICRMW_FSUB,
1676       G_READ_REGISTER,
1677       G_WRITE_REGISTER,
1678 
1679       G_SADDO, G_SSUBO,
1680 
1681        // TODO: Implement
1682       G_FMINIMUM, G_FMAXIMUM}).lower();
1683 
1684   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1685         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1686         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1687     .unsupported();
1688 
1689   getLegacyLegalizerInfo().computeTables();
1690   verify(*ST.getInstrInfo());
1691 }
1692 
1693 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1694                                          MachineInstr &MI) const {
1695   MachineIRBuilder &B = Helper.MIRBuilder;
1696   MachineRegisterInfo &MRI = *B.getMRI();
1697 
1698   switch (MI.getOpcode()) {
1699   case TargetOpcode::G_ADDRSPACE_CAST:
1700     return legalizeAddrSpaceCast(MI, MRI, B);
1701   case TargetOpcode::G_FRINT:
1702     return legalizeFrint(MI, MRI, B);
1703   case TargetOpcode::G_FCEIL:
1704     return legalizeFceil(MI, MRI, B);
1705   case TargetOpcode::G_FREM:
1706     return legalizeFrem(MI, MRI, B);
1707   case TargetOpcode::G_INTRINSIC_TRUNC:
1708     return legalizeIntrinsicTrunc(MI, MRI, B);
1709   case TargetOpcode::G_SITOFP:
1710     return legalizeITOFP(MI, MRI, B, true);
1711   case TargetOpcode::G_UITOFP:
1712     return legalizeITOFP(MI, MRI, B, false);
1713   case TargetOpcode::G_FPTOSI:
1714     return legalizeFPTOI(MI, MRI, B, true);
1715   case TargetOpcode::G_FPTOUI:
1716     return legalizeFPTOI(MI, MRI, B, false);
1717   case TargetOpcode::G_FMINNUM:
1718   case TargetOpcode::G_FMAXNUM:
1719   case TargetOpcode::G_FMINNUM_IEEE:
1720   case TargetOpcode::G_FMAXNUM_IEEE:
1721     return legalizeMinNumMaxNum(Helper, MI);
1722   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1723     return legalizeExtractVectorElt(MI, MRI, B);
1724   case TargetOpcode::G_INSERT_VECTOR_ELT:
1725     return legalizeInsertVectorElt(MI, MRI, B);
1726   case TargetOpcode::G_SHUFFLE_VECTOR:
1727     return legalizeShuffleVector(MI, MRI, B);
1728   case TargetOpcode::G_FSIN:
1729   case TargetOpcode::G_FCOS:
1730     return legalizeSinCos(MI, MRI, B);
1731   case TargetOpcode::G_GLOBAL_VALUE:
1732     return legalizeGlobalValue(MI, MRI, B);
1733   case TargetOpcode::G_LOAD:
1734   case TargetOpcode::G_SEXTLOAD:
1735   case TargetOpcode::G_ZEXTLOAD:
1736     return legalizeLoad(Helper, MI);
1737   case TargetOpcode::G_FMAD:
1738     return legalizeFMad(MI, MRI, B);
1739   case TargetOpcode::G_FDIV:
1740     return legalizeFDIV(MI, MRI, B);
1741   case TargetOpcode::G_UDIV:
1742   case TargetOpcode::G_UREM:
1743   case TargetOpcode::G_UDIVREM:
1744     return legalizeUnsignedDIV_REM(MI, MRI, B);
1745   case TargetOpcode::G_SDIV:
1746   case TargetOpcode::G_SREM:
1747   case TargetOpcode::G_SDIVREM:
1748     return legalizeSignedDIV_REM(MI, MRI, B);
1749   case TargetOpcode::G_ATOMIC_CMPXCHG:
1750     return legalizeAtomicCmpXChg(MI, MRI, B);
1751   case TargetOpcode::G_FLOG:
1752     return legalizeFlog(MI, B, numbers::ln2f);
1753   case TargetOpcode::G_FLOG10:
1754     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1755   case TargetOpcode::G_FEXP:
1756     return legalizeFExp(MI, B);
1757   case TargetOpcode::G_FPOW:
1758     return legalizeFPow(MI, B);
1759   case TargetOpcode::G_FFLOOR:
1760     return legalizeFFloor(MI, MRI, B);
1761   case TargetOpcode::G_BUILD_VECTOR:
1762     return legalizeBuildVector(MI, MRI, B);
1763   default:
1764     return false;
1765   }
1766 
1767   llvm_unreachable("expected switch to return");
1768 }
1769 
1770 Register AMDGPULegalizerInfo::getSegmentAperture(
1771   unsigned AS,
1772   MachineRegisterInfo &MRI,
1773   MachineIRBuilder &B) const {
1774   MachineFunction &MF = B.getMF();
1775   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1776   const LLT S32 = LLT::scalar(32);
1777 
1778   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1779 
1780   if (ST.hasApertureRegs()) {
1781     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1782     // getreg.
1783     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1784         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1785         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1786     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1787         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1788         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
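    // Pack the hwreg id, bit offset, and field width (minus one) into the
    // immediate operand expected by S_GETREG_B32.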
1789     unsigned Encoding =
1790         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1791         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1792         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1793 
1794     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1795 
1796     B.buildInstr(AMDGPU::S_GETREG_B32)
1797       .addDef(GetReg)
1798       .addImm(Encoding);
1799     MRI.setType(GetReg, S32);
1800 
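    // S_GETREG_B32 right-justifies the selected field; shifting it left by the
    // field width (WidthM1 + 1 == 16) reconstructs the high 32 bits of the
    // 64-bit aperture base address.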
1801     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1802     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1803   }
1804 
1805   Register QueuePtr = MRI.createGenericVirtualRegister(
1806     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1807 
1808   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1809     return Register();
1810 
1811   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1812   // private_segment_aperture_base_hi.
1813   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1814 
1815   // TODO: can we be smarter about machine pointer info?
1816   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1817   MachineMemOperand *MMO = MF.getMachineMemOperand(
1818       PtrInfo,
1819       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1820           MachineMemOperand::MOInvariant,
1821       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1822 
1823   Register LoadAddr;
1824 
1825   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1826   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1827 }
1828 
1829 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1830   MachineInstr &MI, MachineRegisterInfo &MRI,
1831   MachineIRBuilder &B) const {
1832   MachineFunction &MF = B.getMF();
1833 
1834   const LLT S32 = LLT::scalar(32);
1835   Register Dst = MI.getOperand(0).getReg();
1836   Register Src = MI.getOperand(1).getReg();
1837 
1838   LLT DstTy = MRI.getType(Dst);
1839   LLT SrcTy = MRI.getType(Src);
1840   unsigned DestAS = DstTy.getAddressSpace();
1841   unsigned SrcAS = SrcTy.getAddressSpace();
1842 
1843   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1844   // vector element.
1845   assert(!DstTy.isVector());
1846 
1847   const AMDGPUTargetMachine &TM
1848     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1849 
1850   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1851     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1852     return true;
1853   }
1854 
1855   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1856     // Truncate.
1857     B.buildExtract(Dst, Src, 0);
1858     MI.eraseFromParent();
1859     return true;
1860   }
1861 
1862   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1864     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1865 
1866     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1867     // another pointer. Merge operands are required to be the same type, but
1868     // creating an extra ptrtoint would be kind of pointless.
1869     auto HighAddr = B.buildConstant(
1870       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1871     B.buildMerge(Dst, {Src, HighAddr});
1872     MI.eraseFromParent();
1873     return true;
1874   }
1875 
1876   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1877     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1878            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1879     unsigned NullVal = TM.getNullPointerValue(DestAS);
1880 
1881     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1882     auto FlatNull = B.buildConstant(SrcTy, 0);
1883 
1884     // Extract low 32-bits of the pointer.
1885     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1886 
1887     auto CmpRes =
1888         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1889     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1890 
1891     MI.eraseFromParent();
1892     return true;
1893   }
1894 
1895   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1896     return false;
1897 
1898   if (!ST.hasFlatAddressSpace())
1899     return false;
1900 
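  // Casting a local/private pointer to flat: the 64-bit flat address is built
  // as {32-bit segment offset, aperture base high half}, and the segment null
  // value is mapped to the flat null value.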
1901   auto SegmentNull =
1902       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1903   auto FlatNull =
1904       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1905 
1906   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1907   if (!ApertureReg.isValid())
1908     return false;
1909 
1910   auto CmpRes =
1911       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1912 
1913   // Coerce the type of the low half of the result so we can use merge_values.
1914   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1915 
1916   // TODO: Should we allow mismatched types but matching sizes in merges to
1917   // avoid the ptrtoint?
1918   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1919   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1920 
1921   MI.eraseFromParent();
1922   return true;
1923 }
1924 
1925 bool AMDGPULegalizerInfo::legalizeFrint(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   MachineIRBuilder &B) const {
1928   Register Src = MI.getOperand(1).getReg();
1929   LLT Ty = MRI.getType(Src);
1930   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1931 
1932   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1933   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1934 
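  // Adding and then subtracting a 2^52 value with src's sign rounds src to an
  // integer in the current rounding mode (the G_FRINT semantics) for |src|
  // below ~2^52. Larger magnitudes are already integers, so the compare
  // against C2 selects the original value in that case.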
1935   auto C1 = B.buildFConstant(Ty, C1Val);
1936   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1937 
1938   // TODO: Should this propagate fast-math-flags?
1939   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1940   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1941 
1942   auto C2 = B.buildFConstant(Ty, C2Val);
1943   auto Fabs = B.buildFAbs(Ty, Src);
1944 
1945   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1946   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1947   MI.eraseFromParent();
1948   return true;
1949 }
1950 
1951 bool AMDGPULegalizerInfo::legalizeFceil(
1952   MachineInstr &MI, MachineRegisterInfo &MRI,
1953   MachineIRBuilder &B) const {
1954 
1955   const LLT S1 = LLT::scalar(1);
1956   const LLT S64 = LLT::scalar(64);
1957 
1958   Register Src = MI.getOperand(1).getReg();
1959   assert(MRI.getType(Src) == S64);
1960 
1961   // result = trunc(src)
1962   // if (src > 0.0 && src != result)
1963   //   result += 1.0
1964 
1965   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1966 
1967   const auto Zero = B.buildFConstant(S64, 0.0);
1968   const auto One = B.buildFConstant(S64, 1.0);
1969   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1970   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1971   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1972   auto Add = B.buildSelect(S64, And, One, Zero);
1973 
1974   // TODO: Should this propagate fast-math-flags?
1975   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1976   return true;
1977 }
1978 
1979 bool AMDGPULegalizerInfo::legalizeFrem(
1980   MachineInstr &MI, MachineRegisterInfo &MRI,
1981   MachineIRBuilder &B) const {
1982     Register DstReg = MI.getOperand(0).getReg();
1983     Register Src0Reg = MI.getOperand(1).getReg();
1984     Register Src1Reg = MI.getOperand(2).getReg();
1985     auto Flags = MI.getFlags();
1986     LLT Ty = MRI.getType(DstReg);
1987 
1988     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1989     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1990     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1991     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1992     MI.eraseFromParent();
1993     return true;
1994 }
1995 
1996 static MachineInstrBuilder extractF64Exponent(Register Hi,
1997                                               MachineIRBuilder &B) {
1998   const unsigned FractBits = 52;
1999   const unsigned ExpBits = 11;
2000   LLT S32 = LLT::scalar(32);
2001 
2002   auto Const0 = B.buildConstant(S32, FractBits - 32);
2003   auto Const1 = B.buildConstant(S32, ExpBits);
2004 
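  // llvm.amdgcn.ubfe extracts the 11 exponent bits starting at bit 20 of the
  // high dword (bit 52 of the f64); subtracting 1023 removes the IEEE bias.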
2005   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2006     .addUse(Hi)
2007     .addUse(Const0.getReg(0))
2008     .addUse(Const1.getReg(0));
2009 
2010   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2011 }
2012 
2013 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2014   MachineInstr &MI, MachineRegisterInfo &MRI,
2015   MachineIRBuilder &B) const {
2016   const LLT S1 = LLT::scalar(1);
2017   const LLT S32 = LLT::scalar(32);
2018   const LLT S64 = LLT::scalar(64);
2019 
2020   Register Src = MI.getOperand(1).getReg();
2021   assert(MRI.getType(Src) == S64);
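  // The truncation is done by masking off the fractional mantissa bits: for an
  // unbiased exponent e, the low (52 - e) mantissa bits are fractional. If
  // e < 0 then |src| < 1 and only the sign survives; if e > 51 the value is
  // already an integer and is returned unchanged.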
2022 
2023   // TODO: Should this use extract since the low half is unused?
2024   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2025   Register Hi = Unmerge.getReg(1);
2026 
2027   // Extract the upper half, since this is where we will find the sign and
2028   // exponent.
2029   auto Exp = extractF64Exponent(Hi, B);
2030 
2031   const unsigned FractBits = 52;
2032 
2033   // Extract the sign bit.
2034   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2035   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2036 
2037   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2038 
2039   const auto Zero32 = B.buildConstant(S32, 0);
2040 
2041   // Extend back to 64-bits.
2042   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2043 
2044   auto Shr = B.buildAShr(S64, FractMask, Exp);
2045   auto Not = B.buildNot(S64, Shr);
2046   auto Tmp0 = B.buildAnd(S64, Src, Not);
2047   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2048 
2049   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2050   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2051 
2052   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2053   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2054   MI.eraseFromParent();
2055   return true;
2056 }
2057 
2058 bool AMDGPULegalizerInfo::legalizeITOFP(
2059   MachineInstr &MI, MachineRegisterInfo &MRI,
2060   MachineIRBuilder &B, bool Signed) const {
2061 
2062   Register Dst = MI.getOperand(0).getReg();
2063   Register Src = MI.getOperand(1).getReg();
2064 
2065   const LLT S64 = LLT::scalar(64);
2066   const LLT S32 = LLT::scalar(32);
2067 
2068   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2069 
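  // Convert the two 32-bit halves separately and recombine:
  //   result = convert(hi) * 2^32 + uitofp(lo)
  // Only the high half carries the signedness of the original value.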
2070   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2071 
2072   auto CvtHi = Signed ?
2073     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2074     B.buildUITOFP(S64, Unmerge.getReg(1));
2075 
2076   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2077 
2078   auto ThirtyTwo = B.buildConstant(S32, 32);
2079   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2080     .addUse(CvtHi.getReg(0))
2081     .addUse(ThirtyTwo.getReg(0));
2082 
2083   // TODO: Should this propagate fast-math-flags?
2084   B.buildFAdd(Dst, LdExp, CvtLo);
2085   MI.eraseFromParent();
2086   return true;
2087 }
2088 
2089 // TODO: Copied from DAG implementation. Verify logic and document how this
2090 // actually works.
2091 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2092                                         MachineRegisterInfo &MRI,
2093                                         MachineIRBuilder &B,
2094                                         bool Signed) const {
2095 
2096   Register Dst = MI.getOperand(0).getReg();
2097   Register Src = MI.getOperand(1).getReg();
2098 
2099   const LLT S64 = LLT::scalar(64);
2100   const LLT S32 = LLT::scalar(32);
2101 
2102   const LLT SrcLT = MRI.getType(Src);
2103   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2104 
2105   unsigned Flags = MI.getFlags();
2106 
2107   // The basic idea of converting a floating point number into a pair of 32-bit
2108   // integers is illustrated as follows:
2109   //
2110   //     tf := trunc(val);
2111   //    hif := floor(tf * 2^-32);
2112   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2113   //     hi := fptoi(hif);
2114   //     lo := fptoi(lof);
2115   //
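  // (Worked example: val = 2^40 + 7.0 gives tf = 2^40 + 7, hif = 256 and
  //  lof = 7, so the result is the 64-bit integer 256 * 2^32 + 7.)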
2116   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2117   MachineInstrBuilder Sign;
2118   if (Signed && SrcLT == S32) {
2119     // However, a 32-bit floating point number has only a 23-bit mantissa,
2120     // which is not enough to hold all the significant bits of `lof` if val is
2121     // negative. To avoid the loss of precision, we take the absolute value
2122     // after truncating and flip the result back based on the original
2123     // signedness.
2124     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2125     Trunc = B.buildFAbs(S32, Trunc, Flags);
2126   }
2127   MachineInstrBuilder K0, K1;
2128   if (SrcLT == S64) {
2129     K0 = B.buildFConstant(S64,
2130                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2131     K1 = B.buildFConstant(S64,
2132                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2133   } else {
2134     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2135     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2136   }
2137 
2138   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2139   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2140   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2141 
2142   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2143                                      : B.buildFPTOUI(S32, FloorMul);
2144   auto Lo = B.buildFPTOUI(S32, Fma);
2145 
2146   if (Signed && SrcLT == S32) {
2147     // Flip the result based on the sign mask (either all 0s or all 1s).
2148     Sign = B.buildMerge(S64, {Sign, Sign});
2149     // r := xor({lo, hi}, sign) - sign;
2150     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2151   } else
2152     B.buildMerge(Dst, {Lo, Hi});
2153   MI.eraseFromParent();
2154 
2155   return true;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2159                                                MachineInstr &MI) const {
2160   MachineFunction &MF = Helper.MIRBuilder.getMF();
2161   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2162 
2163   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2164                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2165 
2166   // With ieee_mode disabled, the instructions have the correct behavior
2167   // already for G_FMINNUM/G_FMAXNUM
2168   if (!MFI->getMode().IEEE)
2169     return !IsIEEEOp;
2170 
2171   if (IsIEEEOp)
2172     return true;
2173 
2174   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2175 }
2176 
2177 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2178   MachineInstr &MI, MachineRegisterInfo &MRI,
2179   MachineIRBuilder &B) const {
2180   // TODO: Should move some of this into LegalizerHelper.
2181 
2182   // TODO: Promote dynamic indexing of s16 to s32
2183 
2184   // FIXME: Artifact combiner probably should have replaced the truncated
2185   // constant before this, so we shouldn't need
2186   // getConstantVRegValWithLookThrough.
2187   Optional<ValueAndVReg> MaybeIdxVal =
2188       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2189   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2190     return true;
2191   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2192 
2193   Register Dst = MI.getOperand(0).getReg();
2194   Register Vec = MI.getOperand(1).getReg();
2195 
2196   LLT VecTy = MRI.getType(Vec);
2197   LLT EltTy = VecTy.getElementType();
2198   assert(EltTy == MRI.getType(Dst));
2199 
2200   if (IdxVal < VecTy.getNumElements())
2201     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2202   else
2203     B.buildUndef(Dst);
2204 
2205   MI.eraseFromParent();
2206   return true;
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2210   MachineInstr &MI, MachineRegisterInfo &MRI,
2211   MachineIRBuilder &B) const {
2212   // TODO: Should move some of this into LegalizerHelper.
2213 
2214   // TODO: Promote dynamic indexing of s16 to s32
2215 
2216   // FIXME: Artifact combiner probably should have replaced the truncated
2217   // constant before this, so we shouldn't need
2218   // getConstantVRegValWithLookThrough.
2219   Optional<ValueAndVReg> MaybeIdxVal =
2220       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2221   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2222     return true;
2223 
2224   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2225   Register Dst = MI.getOperand(0).getReg();
2226   Register Vec = MI.getOperand(1).getReg();
2227   Register Ins = MI.getOperand(2).getReg();
2228 
2229   LLT VecTy = MRI.getType(Vec);
2230   LLT EltTy = VecTy.getElementType();
2231   assert(EltTy == MRI.getType(Ins));
2232 
2233   if (IdxVal < VecTy.getNumElements())
2234     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2235   else
2236     B.buildUndef(Dst);
2237 
2238   MI.eraseFromParent();
2239   return true;
2240 }
2241 
2242 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2243   MachineInstr &MI, MachineRegisterInfo &MRI,
2244   MachineIRBuilder &B) const {
2245   const LLT V2S16 = LLT::fixed_vector(2, 16);
2246 
2247   Register Dst = MI.getOperand(0).getReg();
2248   Register Src0 = MI.getOperand(1).getReg();
2249   LLT DstTy = MRI.getType(Dst);
2250   LLT SrcTy = MRI.getType(Src0);
2251 
2252   if (SrcTy == V2S16 && DstTy == V2S16 &&
2253       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2254     return true;
2255 
2256   MachineIRBuilder HelperBuilder(MI);
2257   GISelObserverWrapper DummyObserver;
2258   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2259   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2260 }
2261 
2262 bool AMDGPULegalizerInfo::legalizeSinCos(
2263   MachineInstr &MI, MachineRegisterInfo &MRI,
2264   MachineIRBuilder &B) const {
2265 
2266   Register DstReg = MI.getOperand(0).getReg();
2267   Register SrcReg = MI.getOperand(1).getReg();
2268   LLT Ty = MRI.getType(DstReg);
2269   unsigned Flags = MI.getFlags();
2270 
2271   Register TrigVal;
2272   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2273   if (ST.hasTrigReducedRange()) {
2274     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2275     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2276       .addUse(MulVal.getReg(0))
2277       .setMIFlags(Flags).getReg(0);
2278   } else
2279     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2280 
2281   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2282     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2283   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2284     .addUse(TrigVal)
2285     .setMIFlags(Flags);
2286   MI.eraseFromParent();
2287   return true;
2288 }
2289 
2290 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2291                                                   MachineIRBuilder &B,
2292                                                   const GlobalValue *GV,
2293                                                   int64_t Offset,
2294                                                   unsigned GAFlags) const {
2295   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2296   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2297   // to the following code sequence:
2298   //
2299   // For constant address space:
2300   //   s_getpc_b64 s[0:1]
2301   //   s_add_u32 s0, s0, $symbol
2302   //   s_addc_u32 s1, s1, 0
2303   //
2304   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2305   //   a fixup or relocation is emitted to replace $symbol with a literal
2306   //   constant, which is a pc-relative offset from the encoding of the $symbol
2307   //   operand to the global variable.
2308   //
2309   // For global address space:
2310   //   s_getpc_b64 s[0:1]
2311   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2312   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2313   //
2314   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2315   //   fixups or relocations are emitted to replace $symbol@*@lo and
2316   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2317   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2318   //   operand to the global variable.
2319   //
2320   // What we want here is an offset from the value returned by s_getpc
2321   // (which is the address of the s_add_u32 instruction) to the global
2322   // variable, but since the encoding of $symbol starts 4 bytes after the start
2323   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2324   // small. This requires us to add 4 to the global variable offset in order to
2325   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2326   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2327   // instruction.
2328 
2329   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2330 
2331   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2332     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2333 
2334   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2335     .addDef(PCReg);
2336 
2337   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2338   if (GAFlags == SIInstrInfo::MO_NONE)
2339     MIB.addImm(0);
2340   else
2341     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2342 
2343   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2344 
2345   if (PtrTy.getSizeInBits() == 32)
2346     B.buildExtract(DstReg, PCReg, 0);
2347   return true;
2348  }
2349 
2350 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2351   MachineInstr &MI, MachineRegisterInfo &MRI,
2352   MachineIRBuilder &B) const {
2353   Register DstReg = MI.getOperand(0).getReg();
2354   LLT Ty = MRI.getType(DstReg);
2355   unsigned AS = Ty.getAddressSpace();
2356 
2357   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2358   MachineFunction &MF = B.getMF();
2359   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2360 
2361   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2362     if (!MFI->isModuleEntryFunction() &&
2363         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2364       const Function &Fn = MF.getFunction();
2365       DiagnosticInfoUnsupported BadLDSDecl(
2366         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2367         DS_Warning);
2368       Fn.getContext().diagnose(BadLDSDecl);
2369 
2370       // We currently don't have a way to correctly allocate LDS objects that
2371       // aren't directly associated with a kernel. We do force inlining of
2372       // functions that use local objects. However, if these dead functions are
2373       // not eliminated, we don't want a compile time error. Just emit a warning
2374       // and a trap, since there should be no callable path here.
2375       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2376       B.buildUndef(DstReg);
2377       MI.eraseFromParent();
2378       return true;
2379     }
2380 
2381     // TODO: We could emit code to handle the initialization somewhere.
2382     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2383       const SITargetLowering *TLI = ST.getTargetLowering();
2384       if (!TLI->shouldUseLDSConstAddress(GV)) {
2385         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2386         return true; // Leave in place;
2387       }
2388 
2389       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2390         Type *Ty = GV->getValueType();
2391         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2392         // zero-sized type in other languages to declare the dynamic shared
2393         // memory whose size is not known at compile time. Such arrays are
2394         // allocated by the runtime and placed directly after the statically
2395         // allocated ones, so they all share the same offset.
2396         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2397           // Adjust alignment for that dynamic shared memory array.
2398           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2399           LLT S32 = LLT::scalar(32);
2400           auto Sz =
2401               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2402           B.buildIntToPtr(DstReg, Sz);
2403           MI.eraseFromParent();
2404           return true;
2405         }
2406       }
2407 
2408       B.buildConstant(
2409           DstReg,
2410           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2411       MI.eraseFromParent();
2412       return true;
2413     }
2414 
2415     const Function &Fn = MF.getFunction();
2416     DiagnosticInfoUnsupported BadInit(
2417       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2418     Fn.getContext().diagnose(BadInit);
2419     return true;
2420   }
2421 
2422   const SITargetLowering *TLI = ST.getTargetLowering();
2423 
2424   if (TLI->shouldEmitFixup(GV)) {
2425     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2426     MI.eraseFromParent();
2427     return true;
2428   }
2429 
2430   if (TLI->shouldEmitPCReloc(GV)) {
2431     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2432     MI.eraseFromParent();
2433     return true;
2434   }
2435 
2436   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2437   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2438 
2439   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2440   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2441       MachinePointerInfo::getGOT(MF),
2442       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2443           MachineMemOperand::MOInvariant,
2444       LoadTy, Align(8));
2445 
2446   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2447 
2448   if (Ty.getSizeInBits() == 32) {
2449     // Truncate if this is a 32-bit constant address.
2450     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2451     B.buildExtract(DstReg, Load, 0);
2452   } else
2453     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2454 
2455   MI.eraseFromParent();
2456   return true;
2457 }
2458 
2459 static LLT widenToNextPowerOf2(LLT Ty) {
2460   if (Ty.isVector())
2461     return Ty.changeElementCount(
2462         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2463   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2464 }
2465 
2466 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2467                                        MachineInstr &MI) const {
2468   MachineIRBuilder &B = Helper.MIRBuilder;
2469   MachineRegisterInfo &MRI = *B.getMRI();
2470   GISelChangeObserver &Observer = Helper.Observer;
2471 
2472   Register PtrReg = MI.getOperand(1).getReg();
2473   LLT PtrTy = MRI.getType(PtrReg);
2474   unsigned AddrSpace = PtrTy.getAddressSpace();
2475 
2476   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2477     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2478     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2479     Observer.changingInstr(MI);
2480     MI.getOperand(1).setReg(Cast.getReg(0));
2481     Observer.changedInstr(MI);
2482     return true;
2483   }
2484 
2485   if (MI.getOpcode() != AMDGPU::G_LOAD)
2486     return false;
2487 
2488   Register ValReg = MI.getOperand(0).getReg();
2489   LLT ValTy = MRI.getType(ValReg);
2490 
2491   MachineMemOperand *MMO = *MI.memoperands_begin();
2492   const unsigned ValSize = ValTy.getSizeInBits();
2493   const LLT MemTy = MMO->getMemoryType();
2494   const Align MemAlign = MMO->getAlign();
2495   const unsigned MemSize = MemTy.getSizeInBits();
2496   const unsigned AlignInBits = 8 * MemAlign.value();
2497 
2498   // Widen non-power-of-2 loads to the alignment if needed
2499   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2500     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2501 
2502     // This was already the correct extending load result type, so just adjust
2503     // the memory type.
2504     if (WideMemSize == ValSize) {
2505       MachineFunction &MF = B.getMF();
2506 
2507       MachineMemOperand *WideMMO =
2508           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2509       Observer.changingInstr(MI);
2510       MI.setMemRefs(MF, {WideMMO});
2511       Observer.changedInstr(MI);
2512       return true;
2513     }
2514 
2515     // Don't bother handling an edge case that should probably never be produced.
2516     if (ValSize > WideMemSize)
2517       return false;
2518 
2519     LLT WideTy = widenToNextPowerOf2(ValTy);
2520 
2521     Register WideLoad;
2522     if (!WideTy.isVector()) {
2523       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2524       B.buildTrunc(ValReg, WideLoad).getReg(0);
2525     } else {
2526       // Extract the subvector.
2527 
2528       if (isRegisterType(ValTy)) {
2529         // If this a case where G_EXTRACT is legal, use it.
2530         // (e.g. <3 x s32> -> <4 x s32>)
2531         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2532         B.buildExtract(ValReg, WideLoad, 0);
2533       } else {
2534         // For cases where the widened type isn't a nice register value, unmerge
2535         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2536         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2537         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2538         B.setInsertPt(B.getMBB(), MI.getIterator());
2539         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2540       }
2541     }
2542 
2543     MI.eraseFromParent();
2544     return true;
2545   }
2546 
2547   return false;
2548 }
2549 
2550 bool AMDGPULegalizerInfo::legalizeFMad(
2551   MachineInstr &MI, MachineRegisterInfo &MRI,
2552   MachineIRBuilder &B) const {
2553   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2554   assert(Ty.isScalar());
2555 
2556   MachineFunction &MF = B.getMF();
2557   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2558 
2559   // TODO: Always legal with future ftz flag.
2560   // FIXME: Do we need just output?
2561   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2562     return true;
2563   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2564     return true;
2565 
2566   MachineIRBuilder HelperBuilder(MI);
2567   GISelObserverWrapper DummyObserver;
2568   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2569   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2570 }
2571 
2572 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2573   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2574   Register DstReg = MI.getOperand(0).getReg();
2575   Register PtrReg = MI.getOperand(1).getReg();
2576   Register CmpVal = MI.getOperand(2).getReg();
2577   Register NewVal = MI.getOperand(3).getReg();
2578 
2579   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2580          "this should not have been custom lowered");
2581 
2582   LLT ValTy = MRI.getType(CmpVal);
2583   LLT VecTy = LLT::fixed_vector(2, ValTy);
2584 
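  // The target cmpxchg pseudo takes the new value and the compare value packed
  // into a single vector operand, with the new value in element 0.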
2585   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2586 
2587   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2588     .addDef(DstReg)
2589     .addUse(PtrReg)
2590     .addUse(PackedVal)
2591     .setMemRefs(MI.memoperands());
2592 
2593   MI.eraseFromParent();
2594   return true;
2595 }
2596 
2597 bool AMDGPULegalizerInfo::legalizeFlog(
2598   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2599   Register Dst = MI.getOperand(0).getReg();
2600   Register Src = MI.getOperand(1).getReg();
2601   LLT Ty = B.getMRI()->getType(Dst);
2602   unsigned Flags = MI.getFlags();
2603 
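  // log_b(x) = log2(x) * (ln(2) / ln(b)); the caller passes the precomputed
  // Log2BaseInverted factor (ln(2) for ln, ln(2)/ln(10) for log10).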
2604   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2605   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2606 
2607   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2608   MI.eraseFromParent();
2609   return true;
2610 }
2611 
2612 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2613                                        MachineIRBuilder &B) const {
2614   Register Dst = MI.getOperand(0).getReg();
2615   Register Src = MI.getOperand(1).getReg();
2616   unsigned Flags = MI.getFlags();
2617   LLT Ty = B.getMRI()->getType(Dst);
2618 
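  // exp(x) = exp2(x * log2(e)).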
2619   auto K = B.buildFConstant(Ty, numbers::log2e);
2620   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2621   B.buildFExp2(Dst, Mul, Flags);
2622   MI.eraseFromParent();
2623   return true;
2624 }
2625 
2626 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2627                                        MachineIRBuilder &B) const {
2628   Register Dst = MI.getOperand(0).getReg();
2629   Register Src0 = MI.getOperand(1).getReg();
2630   Register Src1 = MI.getOperand(2).getReg();
2631   unsigned Flags = MI.getFlags();
2632   LLT Ty = B.getMRI()->getType(Dst);
2633   const LLT S16 = LLT::scalar(16);
2634   const LLT S32 = LLT::scalar(32);
2635 
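  // pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply treats
  // 0 * anything (including inf/nan) as 0, so e.g. pow(1.0, inf) becomes
  // exp2(0) = 1.0 instead of NaN.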
2636   if (Ty == S32) {
2637     auto Log = B.buildFLog2(S32, Src0, Flags);
2638     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2639       .addUse(Log.getReg(0))
2640       .addUse(Src1)
2641       .setMIFlags(Flags);
2642     B.buildFExp2(Dst, Mul, Flags);
2643   } else if (Ty == S16) {
2644     // There's no f16 fmul_legacy, so we need to convert for it.
2645     auto Log = B.buildFLog2(S16, Src0, Flags);
2646     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2647     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2648     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2649       .addUse(Ext0.getReg(0))
2650       .addUse(Ext1.getReg(0))
2651       .setMIFlags(Flags);
2652 
2653     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2654   } else
2655     return false;
2656 
2657   MI.eraseFromParent();
2658   return true;
2659 }
2660 
2661 // Find a source register, ignoring any possible source modifiers.
2662 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2663   Register ModSrc = OrigSrc;
2664   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2665     ModSrc = SrcFNeg->getOperand(1).getReg();
2666     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2667       ModSrc = SrcFAbs->getOperand(1).getReg();
2668   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2669     ModSrc = SrcFAbs->getOperand(1).getReg();
2670   return ModSrc;
2671 }
2672 
2673 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2674                                          MachineRegisterInfo &MRI,
2675                                          MachineIRBuilder &B) const {
2676 
2677   const LLT S1 = LLT::scalar(1);
2678   const LLT S64 = LLT::scalar(64);
2679   Register Dst = MI.getOperand(0).getReg();
2680   Register OrigSrc = MI.getOperand(1).getReg();
2681   unsigned Flags = MI.getFlags();
2682   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2683          "this should not have been custom lowered");
2684 
2685   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2686   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2687   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2688   // V_FRACT bug is:
2689   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2690   //
2691   // Convert floor(x) to (x - fract(x))
2692 
2693   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2694     .addUse(OrigSrc)
2695     .setMIFlags(Flags);
2696 
2697   // Give source modifier matching some assistance before obscuring a foldable
2698   // pattern.
2699 
2700   // TODO: We can avoid the neg on the fract? The input sign to fract
2701   // shouldn't matter?
2702   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2703 
2704   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2705 
2706   Register Min = MRI.createGenericVirtualRegister(S64);
2707 
2708   // We don't need to concern ourselves with the snan handling difference, so
2709   // use the one which will directly select.
2710   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2711   if (MFI->getMode().IEEE)
2712     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2713   else
2714     B.buildFMinNum(Min, Fract, Const, Flags);
2715 
2716   Register CorrectedFract = Min;
2717   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2718     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2719     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2720   }
2721 
2722   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2723   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2724 
2725   MI.eraseFromParent();
2726   return true;
2727 }
2728 
2729 // Turn an illegal packed v2s16 build vector into bit operations.
2730 // TODO: This should probably be a bitcast action in LegalizerHelper.
2731 bool AMDGPULegalizerInfo::legalizeBuildVector(
2732   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2733   Register Dst = MI.getOperand(0).getReg();
2734   const LLT S32 = LLT::scalar(32);
2735   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2736 
2737   Register Src0 = MI.getOperand(1).getReg();
2738   Register Src1 = MI.getOperand(2).getReg();
2739   assert(MRI.getType(Src0) == LLT::scalar(16));
2740 
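  // Pack the two half words into an s32, with element 0 in the low 16 bits,
  // and bitcast the result to v2s16.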
2741   auto Merge = B.buildMerge(S32, {Src0, Src1});
2742   B.buildBitcast(Dst, Merge);
2743 
2744   MI.eraseFromParent();
2745   return true;
2746 }
2747 
2748 // Check that this is a G_XOR x, -1
2749 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2750   if (MI.getOpcode() != TargetOpcode::G_XOR)
2751     return false;
2752   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2753   return ConstVal && *ConstVal == -1;
2754 }
2755 
2756 // Return the use branch instruction, otherwise null if the usage is invalid.
2757 static MachineInstr *
2758 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2759                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2760   Register CondDef = MI.getOperand(0).getReg();
2761   if (!MRI.hasOneNonDBGUse(CondDef))
2762     return nullptr;
2763 
2764   MachineBasicBlock *Parent = MI.getParent();
2765   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2766 
2767   if (isNot(MRI, *UseMI)) {
2768     Register NegatedCond = UseMI->getOperand(0).getReg();
2769     if (!MRI.hasOneNonDBGUse(NegatedCond))
2770       return nullptr;
2771 
2772     // We're deleting the def of this value, so we need to remove it.
2773     UseMI->eraseFromParent();
2774 
2775     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2776     Negated = true;
2777   }
2778 
2779   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2780     return nullptr;
2781 
2782   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2783   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2784   if (Next == Parent->end()) {
2785     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2786     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2787       return nullptr;
2788     UncondBrTarget = &*NextMBB;
2789   } else {
2790     if (Next->getOpcode() != AMDGPU::G_BR)
2791       return nullptr;
2792     Br = &*Next;
2793     UncondBrTarget = Br->getOperand(0).getMBB();
2794   }
2795 
2796   return UseMI;
2797 }
2798 
2799 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2800                                          const ArgDescriptor *Arg,
2801                                          const TargetRegisterClass *ArgRC,
2802                                          LLT ArgTy) const {
2803   MCRegister SrcReg = Arg->getRegister();
2804   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2805   assert(DstReg.isVirtual() && "Virtual register expected");
2806 
2807   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2808                                              ArgTy);
2809   if (Arg->isMasked()) {
2810     // TODO: Should we try to emit this once in the entry block?
2811     const LLT S32 = LLT::scalar(32);
2812     const unsigned Mask = Arg->getMask();
2813     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
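    // (For example, with a hypothetical Mask of 0x3ff00000, Shift is 20 and
    // the field value is (LiveIn >> 20) & 0x3ff.)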
2814 
2815     Register AndMaskSrc = LiveIn;
2816 
2817     if (Shift != 0) {
2818       auto ShiftAmt = B.buildConstant(S32, Shift);
2819       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2820     }
2821 
2822     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2823   } else {
2824     B.buildCopy(DstReg, LiveIn);
2825   }
2826 
2827   return true;
2828 }
2829 
2830 bool AMDGPULegalizerInfo::loadInputValue(
2831     Register DstReg, MachineIRBuilder &B,
2832     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2833   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2834   const ArgDescriptor *Arg;
2835   const TargetRegisterClass *ArgRC;
2836   LLT ArgTy;
2837   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2838 
2839   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2840     return false; // TODO: Handle these
2841   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2842 }
2843 
2844 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2845     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2846     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2847   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2848     return false;
2849 
2850   MI.eraseFromParent();
2851   return true;
2852 }
2853 
2854 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2855                                        MachineRegisterInfo &MRI,
2856                                        MachineIRBuilder &B) const {
2857   Register Dst = MI.getOperand(0).getReg();
2858   LLT DstTy = MRI.getType(Dst);
2859   LLT S16 = LLT::scalar(16);
2860   LLT S32 = LLT::scalar(32);
2861   LLT S64 = LLT::scalar(64);
2862 
2863   if (DstTy == S16)
2864     return legalizeFDIV16(MI, MRI, B);
2865   if (DstTy == S32)
2866     return legalizeFDIV32(MI, MRI, B);
2867   if (DstTy == S64)
2868     return legalizeFDIV64(MI, MRI, B);
2869 
2870   return false;
2871 }
2872 
2873 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2874                                                         Register DstDivReg,
2875                                                         Register DstRemReg,
2876                                                         Register X,
2877                                                         Register Y) const {
2878   const LLT S1 = LLT::scalar(1);
2879   const LLT S32 = LLT::scalar(32);
2880 
2881   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2882   // algorithm used here.
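  // In outline: Z ~= 2^32 / Y is formed from the hardware float reciprocal and
  // refined with one Newton-Raphson step; the quotient estimate Q = umulh(X, Z)
  // and remainder R = X - Q * Y may then be slightly low, which the two
  // compare/select rounds below correct.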
2883 
2884   // Initial estimate of inv(y).
2885   auto FloatY = B.buildUITOFP(S32, Y);
2886   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2887   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2888   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2889   auto Z = B.buildFPTOUI(S32, ScaledY);
2890 
  // One round of UNR (unsigned integer Newton-Raphson) to refine the estimate.
2892   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2893   auto NegYZ = B.buildMul(S32, NegY, Z);
2894   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2895 
2896   // Quotient/remainder estimate.
2897   auto Q = B.buildUMulH(S32, X, Z);
2898   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2899 
2900   // First quotient/remainder refinement.
2901   auto One = B.buildConstant(S32, 1);
2902   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2903   if (DstDivReg)
2904     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2905   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2906 
2907   // Second quotient/remainder refinement.
2908   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2909   if (DstDivReg)
2910     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2911 
2912   if (DstRemReg)
2913     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2914 }
2915 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2917 //
2918 // Return lo, hi of result
2919 //
2920 // %cvt.lo = G_UITOFP Val.lo
2921 // %cvt.hi = G_UITOFP Val.hi
2922 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2923 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2924 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2925 // %mul2 = G_FMUL %mul1, 2**(-32)
2926 // %trunc = G_INTRINSIC_TRUNC %mul2
2927 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2928 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2929 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2930                                                        Register Val) {
2931   const LLT S32 = LLT::scalar(32);
2932   auto Unmerge = B.buildUnmerge(S32, Val);
2933 
2934   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2935   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2936 
2937   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2938                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2939 
2940   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2941   auto Mul1 =
2942       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2943 
2944   // 2**(-32)
2945   auto Mul2 =
2946       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2947   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2948 
2949   // -(2**32)
2950   auto Mad2 = B.buildFMAD(S32, Trunc,
2951                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2952 
2953   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2954   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2955 
2956   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2957 }
2958 
2959 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2960                                                         Register DstDivReg,
2961                                                         Register DstRemReg,
2962                                                         Register Numer,
2963                                                         Register Denom) const {
2964   const LLT S32 = LLT::scalar(32);
2965   const LLT S64 = LLT::scalar(64);
2966   const LLT S1 = LLT::scalar(1);
2967   Register RcpLo, RcpHi;
2968 
2969   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2970 
2971   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2972 
2973   auto Zero64 = B.buildConstant(S64, 0);
2974   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2975 
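  // Two rounds of Newton-Raphson refinement of Rcp ~= 2^64 / Denom (producing
  // Add1, then Add2), done on 32-bit halves so the carries can be tracked
  // explicitly.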
2976   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2977   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2978 
2979   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2980   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2981   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2982 
2983   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2984   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2985   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2986   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2987 
2988   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2989   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2990   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2991   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2992   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2993 
2994   auto Zero32 = B.buildConstant(S32, 0);
2995   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2996   auto Add2_HiC =
2997       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2998   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2999   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3000 
3001   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3002   Register NumerLo = UnmergeNumer.getReg(0);
3003   Register NumerHi = UnmergeNumer.getReg(1);
3004 
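  // Quotient estimate MulHi3 = umulh(Numer, Add2); Sub1 = Numer - Denom * MulHi3
  // is the corresponding remainder estimate, again kept in 32-bit halves.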
3005   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3006   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3007   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3008   Register Mul3_Lo = UnmergeMul3.getReg(0);
3009   Register Mul3_Hi = UnmergeMul3.getReg(1);
3010   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3011   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3012   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3013   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3014 
3015   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3016   Register DenomLo = UnmergeDenom.getReg(0);
3017   Register DenomHi = UnmergeDenom.getReg(1);
3018 
3019   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3020   auto C1 = B.buildSExt(S32, CmpHi);
3021 
3022   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3023   auto C2 = B.buildSExt(S32, CmpLo);
3024 
3025   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3026   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3027 
  // TODO: Here and below, portions of the code could be enclosed in if/endif.
  // Currently control flow is unconditional and we have 4 selects after the
  // potential endif to substitute for PHIs.
3031 
3032   // if C3 != 0 ...
3033   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3034   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3035   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3036   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3037 
3038   auto One64 = B.buildConstant(S64, 1);
3039   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3040 
3041   auto C4 =
3042       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3043   auto C5 =
3044       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3045   auto C6 = B.buildSelect(
3046       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3047 
3048   // if (C6 != 0)
3049   auto Add4 = B.buildAdd(S64, Add3, One64);
3050   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3051 
3052   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3053   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3054   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3055 
3056   // endif C6
3057   // endif C3
3058 
3059   if (DstDivReg) {
3060     auto Sel1 = B.buildSelect(
3061         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3062     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3063                   Sel1, MulHi3);
3064   }
3065 
3066   if (DstRemReg) {
3067     auto Sel2 = B.buildSelect(
3068         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3069     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3070                   Sel2, Sub1);
3071   }
3072 }
3073 
3074 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3075                                                   MachineRegisterInfo &MRI,
3076                                                   MachineIRBuilder &B) const {
3077   Register DstDivReg, DstRemReg;
3078   switch (MI.getOpcode()) {
3079   default:
3080     llvm_unreachable("Unexpected opcode!");
3081   case AMDGPU::G_UDIV: {
3082     DstDivReg = MI.getOperand(0).getReg();
3083     break;
3084   }
3085   case AMDGPU::G_UREM: {
3086     DstRemReg = MI.getOperand(0).getReg();
3087     break;
3088   }
3089   case AMDGPU::G_UDIVREM: {
3090     DstDivReg = MI.getOperand(0).getReg();
3091     DstRemReg = MI.getOperand(1).getReg();
3092     break;
3093   }
3094   }
3095 
3096   const LLT S64 = LLT::scalar(64);
3097   const LLT S32 = LLT::scalar(32);
3098   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3099   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3100   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3101   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3102 
3103   if (Ty == S32)
3104     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3105   else if (Ty == S64)
3106     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3107   else
3108     return false;
3109 
3110   MI.eraseFromParent();
3111   return true;
3112 }
3113 
3114 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3115                                                 MachineRegisterInfo &MRI,
3116                                                 MachineIRBuilder &B) const {
3117   const LLT S64 = LLT::scalar(64);
3118   const LLT S32 = LLT::scalar(32);
3119 
3120   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3121   if (Ty != S32 && Ty != S64)
3122     return false;
3123 
3124   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3125   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3126   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3127 
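  // Take absolute values using the usual sign trick: Sign = X >> (bits - 1) is
  // all ones for negative X, and (X + Sign) ^ Sign == |X|. The proper signs are
  // reapplied to the results after the unsigned expansion.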
3128   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3129   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3130   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3131 
3132   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3133   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3134 
3135   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3136   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3137 
3138   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3139   switch (MI.getOpcode()) {
3140   default:
3141     llvm_unreachable("Unexpected opcode!");
3142   case AMDGPU::G_SDIV: {
3143     DstDivReg = MI.getOperand(0).getReg();
3144     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3145     break;
3146   }
3147   case AMDGPU::G_SREM: {
3148     DstRemReg = MI.getOperand(0).getReg();
3149     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3150     break;
3151   }
3152   case AMDGPU::G_SDIVREM: {
3153     DstDivReg = MI.getOperand(0).getReg();
3154     DstRemReg = MI.getOperand(1).getReg();
3155     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3156     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3157     break;
3158   }
3159   }
3160 
3161   if (Ty == S32)
3162     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3163   else
3164     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3165 
3166   if (DstDivReg) {
3167     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3168     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3169     B.buildSub(DstDivReg, SignXor, Sign);
3170   }
3171 
3172   if (DstRemReg) {
3173     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3174     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3175     B.buildSub(DstRemReg, SignXor, Sign);
3176   }
3177 
3178   MI.eraseFromParent();
3179   return true;
3180 }
3181 
3182 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3183                                                  MachineRegisterInfo &MRI,
3184                                                  MachineIRBuilder &B) const {
3185   Register Res = MI.getOperand(0).getReg();
3186   Register LHS = MI.getOperand(1).getReg();
3187   Register RHS = MI.getOperand(2).getReg();
3188   uint16_t Flags = MI.getFlags();
3189   LLT ResTy = MRI.getType(Res);
3190 
3191   const MachineFunction &MF = B.getMF();
3192   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3193                             MI.getFlag(MachineInstr::FmAfn);
3194 
3195   if (!AllowInaccurateRcp)
3196     return false;
3197 
3198   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3199     // 1 / x -> RCP(x)
3200     if (CLHS->isExactlyValue(1.0)) {
3201       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3202         .addUse(RHS)
3203         .setMIFlags(Flags);
3204 
3205       MI.eraseFromParent();
3206       return true;
3207     }
3208 
3209     // -1 / x -> RCP( FNEG(x) )
3210     if (CLHS->isExactlyValue(-1.0)) {
3211       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3212       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3213         .addUse(FNeg.getReg(0))
3214         .setMIFlags(Flags);
3215 
3216       MI.eraseFromParent();
3217       return true;
3218     }
3219   }
3220 
3221   // x / y -> x * (1.0 / y)
3222   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3223     .addUse(RHS)
3224     .setMIFlags(Flags);
3225   B.buildFMul(Res, LHS, RCP, Flags);
3226 
3227   MI.eraseFromParent();
3228   return true;
3229 }
3230 
3231 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3232                                                    MachineRegisterInfo &MRI,
3233                                                    MachineIRBuilder &B) const {
3234   Register Res = MI.getOperand(0).getReg();
3235   Register X = MI.getOperand(1).getReg();
3236   Register Y = MI.getOperand(2).getReg();
3237   uint16_t Flags = MI.getFlags();
3238   LLT ResTy = MRI.getType(Res);
3239 
3240   const MachineFunction &MF = B.getMF();
3241   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3242                             MI.getFlag(MachineInstr::FmAfn);
3243 
3244   if (!AllowInaccurateRcp)
3245     return false;
3246 
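  // Refine r = rcp(y) with two Newton-Raphson steps, r = r + r * (1 - y * r),
  // then form the quotient q = x * r and apply one final correction,
  // q + r * (x - y * q).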
3247   auto NegY = B.buildFNeg(ResTy, Y);
3248   auto One = B.buildFConstant(ResTy, 1.0);
3249 
3250   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3251     .addUse(Y)
3252     .setMIFlags(Flags);
3253 
3254   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3255   R = B.buildFMA(ResTy, Tmp0, R, R);
3256 
3257   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3258   R = B.buildFMA(ResTy, Tmp1, R, R);
3259 
3260   auto Ret = B.buildFMul(ResTy, X, R);
3261   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3262 
3263   B.buildFMA(Res, Tmp2, R, Ret);
3264   MI.eraseFromParent();
3265   return true;
3266 }
3267 
3268 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3269                                          MachineRegisterInfo &MRI,
3270                                          MachineIRBuilder &B) const {
3271   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3272     return true;
3273 
3274   Register Res = MI.getOperand(0).getReg();
3275   Register LHS = MI.getOperand(1).getReg();
3276   Register RHS = MI.getOperand(2).getReg();
3277 
3278   uint16_t Flags = MI.getFlags();
3279 
3280   LLT S16 = LLT::scalar(16);
3281   LLT S32 = LLT::scalar(32);
3282 
3283   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3284   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3285 
3286   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3287     .addUse(RHSExt.getReg(0))
3288     .setMIFlags(Flags);
3289 
3290   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3291   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3292 
3293   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3294     .addUse(RDst.getReg(0))
3295     .addUse(RHS)
3296     .addUse(LHS)
3297     .setMIFlags(Flags);
3298 
3299   MI.eraseFromParent();
3300   return true;
3301 }
3302 
// Enable or disable FP32 denormal handling. When 'Enable' is true, emit
// instructions that switch FP32 denormals on; when 'Enable' is false, restore
// the function's default FP32 denormal mode.
3305 static void toggleSPDenormMode(bool Enable,
3306                                MachineIRBuilder &B,
3307                                const GCNSubtarget &ST,
3308                                AMDGPU::SIModeRegisterDefaults Mode) {
3309   // Set SP denorm mode to this value.
3310   unsigned SPDenormMode =
3311     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3312 
3313   if (ST.hasDenormModeInst()) {
3314     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3315     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3316 
3317     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3318     B.buildInstr(AMDGPU::S_DENORM_MODE)
3319       .addImm(NewDenormModeValue);
3320 
3321   } else {
3322     // Select FP32 bit field in mode register.
3323     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3324                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3325                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
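    // This selects a 2-bit field at bit offset 4 of the MODE register, i.e. the
    // FP32 denormal control bits.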
3326 
3327     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3328       .addImm(SPDenormMode)
3329       .addImm(SPDenormModeBitField);
3330   }
3331 }
3332 
3333 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3334                                          MachineRegisterInfo &MRI,
3335                                          MachineIRBuilder &B) const {
3336   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3337     return true;
3338 
3339   Register Res = MI.getOperand(0).getReg();
3340   Register LHS = MI.getOperand(1).getReg();
3341   Register RHS = MI.getOperand(2).getReg();
3342   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3343   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3344 
3345   uint16_t Flags = MI.getFlags();
3346 
3347   LLT S32 = LLT::scalar(32);
3348   LLT S1 = LLT::scalar(1);
3349 
3350   auto One = B.buildFConstant(S32, 1.0f);
3351 
3352   auto DenominatorScaled =
3353     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3354       .addUse(LHS)
3355       .addUse(RHS)
3356       .addImm(0)
3357       .setMIFlags(Flags);
3358   auto NumeratorScaled =
3359     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3360       .addUse(LHS)
3361       .addUse(RHS)
3362       .addImm(1)
3363       .setMIFlags(Flags);
3364 
3365   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3366     .addUse(DenominatorScaled.getReg(0))
3367     .setMIFlags(Flags);
3368   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3369 
3370   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3371   // aren't modeled as reading it.
3372   if (!Mode.allFP32Denormals())
3373     toggleSPDenormMode(true, B, ST, Mode);
3374 
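  // Fma0/Fma1 perform one Newton-Raphson step on the scaled reciprocal, Mul is
  // the quotient estimate, Fma2/Fma3 refine it once more, and Fma4 is the final
  // remainder; div_fmas computes fma(Fma4, Fma1, Fma3) with the final scaling
  // driven by the div_scale result.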
3375   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3376   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3377   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3378   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3379   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3380   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3381 
3382   if (!Mode.allFP32Denormals())
3383     toggleSPDenormMode(false, B, ST, Mode);
3384 
3385   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3386     .addUse(Fma4.getReg(0))
3387     .addUse(Fma1.getReg(0))
3388     .addUse(Fma3.getReg(0))
3389     .addUse(NumeratorScaled.getReg(1))
3390     .setMIFlags(Flags);
3391 
3392   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3393     .addUse(Fmas.getReg(0))
3394     .addUse(RHS)
3395     .addUse(LHS)
3396     .setMIFlags(Flags);
3397 
3398   MI.eraseFromParent();
3399   return true;
3400 }
3401 
3402 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3403                                          MachineRegisterInfo &MRI,
3404                                          MachineIRBuilder &B) const {
3405   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3406     return true;
3407 
3408   Register Res = MI.getOperand(0).getReg();
3409   Register LHS = MI.getOperand(1).getReg();
3410   Register RHS = MI.getOperand(2).getReg();
3411 
3412   uint16_t Flags = MI.getFlags();
3413 
3414   LLT S64 = LLT::scalar(64);
3415   LLT S1 = LLT::scalar(1);
3416 
3417   auto One = B.buildFConstant(S64, 1.0);
3418 
3419   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3420     .addUse(LHS)
3421     .addUse(RHS)
3422     .addImm(0)
3423     .setMIFlags(Flags);
3424 
3425   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3426 
3427   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3428     .addUse(DivScale0.getReg(0))
3429     .setMIFlags(Flags);
3430 
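  // Fma0..Fma3 perform two Newton-Raphson refinements of the scaled reciprocal,
  // Mul is the quotient estimate, and Fma4 is its remainder; div_fmas combines
  // Fma4, Fma3 and Mul for the final correction, gated by the div_scale
  // condition.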
3431   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3432   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3433   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3434 
3435   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3436     .addUse(LHS)
3437     .addUse(RHS)
3438     .addImm(1)
3439     .setMIFlags(Flags);
3440 
3441   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3442   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3443   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3444 
3445   Register Scale;
3446   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from div_scale
    // is not usable.
3449 
3450     LLT S32 = LLT::scalar(32);
3451 
3452     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3453     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3454     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3455     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3456 
3457     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3458                               Scale1Unmerge.getReg(1));
3459     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3460                               Scale0Unmerge.getReg(1));
3461     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3462   } else {
3463     Scale = DivScale1.getReg(1);
3464   }
3465 
3466   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3467     .addUse(Fma4.getReg(0))
3468     .addUse(Fma3.getReg(0))
3469     .addUse(Mul.getReg(0))
3470     .addUse(Scale)
3471     .setMIFlags(Flags);
3472 
3473   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3474     .addUse(Fmas.getReg(0))
3475     .addUse(RHS)
3476     .addUse(LHS)
3477     .setMIFlags(Flags);
3478 
3479   MI.eraseFromParent();
3480   return true;
3481 }
3482 
3483 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3484                                                  MachineRegisterInfo &MRI,
3485                                                  MachineIRBuilder &B) const {
3486   Register Res = MI.getOperand(0).getReg();
3487   Register LHS = MI.getOperand(2).getReg();
3488   Register RHS = MI.getOperand(3).getReg();
3489   uint16_t Flags = MI.getFlags();
3490 
3491   LLT S32 = LLT::scalar(32);
3492   LLT S1 = LLT::scalar(1);
3493 
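  // For very large denominators (|RHS| > 0x6f800000, i.e. 2^96) the denominator
  // is pre-scaled by 2^-32 before the rcp, and the same factor is folded back
  // into the final product, so the quotient is unchanged while the intermediate
  // values stay in a more comfortable range.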
3494   auto Abs = B.buildFAbs(S32, RHS, Flags);
3495   const APFloat C0Val(1.0f);
3496 
3497   auto C0 = B.buildConstant(S32, 0x6f800000);
3498   auto C1 = B.buildConstant(S32, 0x2f800000);
3499   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3500 
3501   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3502   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3503 
3504   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3505 
3506   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3507     .addUse(Mul0.getReg(0))
3508     .setMIFlags(Flags);
3509 
3510   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3511 
3512   B.buildFMul(Res, Sel, Mul1, Flags);
3513 
3514   MI.eraseFromParent();
3515   return true;
3516 }
3517 
3518 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3519 // FIXME: Why do we handle this one but not other removed instructions?
3520 //
3521 // Reciprocal square root.  The clamp prevents infinite results, clamping
3522 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3523 // +-max_float.
3524 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3525                                                     MachineRegisterInfo &MRI,
3526                                                     MachineIRBuilder &B) const {
3527   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3528     return true;
3529 
3530   Register Dst = MI.getOperand(0).getReg();
3531   Register Src = MI.getOperand(2).getReg();
3532   auto Flags = MI.getFlags();
3533 
3534   LLT Ty = MRI.getType(Dst);
3535 
3536   const fltSemantics *FltSemantics;
3537   if (Ty == LLT::scalar(32))
3538     FltSemantics = &APFloat::IEEEsingle();
3539   else if (Ty == LLT::scalar(64))
3540     FltSemantics = &APFloat::IEEEdouble();
3541   else
3542     return false;
3543 
3544   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3545     .addUse(Src)
3546     .setMIFlags(Flags);
3547 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq has already quieted (or not) any snan; just use whichever min/max
  // variant will select directly.
3550   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3551   const bool UseIEEE = MFI->getMode().IEEE;
3552 
3553   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3554   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3555                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3556 
3557   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3558 
3559   if (UseIEEE)
3560     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3561   else
3562     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3563   MI.eraseFromParent();
3564   return true;
3565 }
3566 
3567 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3568   switch (IID) {
3569   case Intrinsic::amdgcn_ds_fadd:
3570     return AMDGPU::G_ATOMICRMW_FADD;
3571   case Intrinsic::amdgcn_ds_fmin:
3572     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3573   case Intrinsic::amdgcn_ds_fmax:
3574     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3575   default:
3576     llvm_unreachable("not a DS FP intrinsic");
3577   }
3578 }
3579 
3580 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3581                                                       MachineInstr &MI,
3582                                                       Intrinsic::ID IID) const {
3583   GISelChangeObserver &Observer = Helper.Observer;
3584   Observer.changingInstr(MI);
3585 
3586   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3587 
3588   // The remaining operands were used to set fields in the MemOperand on
3589   // construction.
3590   for (int I = 6; I > 3; --I)
3591     MI.RemoveOperand(I);
3592 
3593   MI.RemoveOperand(1); // Remove the intrinsic ID.
3594   Observer.changedInstr(MI);
3595   return true;
3596 }
3597 
3598 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3599                                             MachineRegisterInfo &MRI,
3600                                             MachineIRBuilder &B) const {
3601   uint64_t Offset =
3602     ST.getTargetLowering()->getImplicitParameterOffset(
3603       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3604   LLT DstTy = MRI.getType(DstReg);
3605   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3606 
3607   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3608   if (!loadInputValue(KernargPtrReg, B,
3609                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3610     return false;
3611 
3612   // FIXME: This should be nuw
3613   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3614   return true;
3615 }
3616 
3617 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3618                                                  MachineRegisterInfo &MRI,
3619                                                  MachineIRBuilder &B) const {
3620   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3621   if (!MFI->isEntryFunction()) {
3622     return legalizePreloadedArgIntrin(MI, MRI, B,
3623                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3624   }
3625 
3626   Register DstReg = MI.getOperand(0).getReg();
3627   if (!getImplicitArgPtr(DstReg, MRI, B))
3628     return false;
3629 
3630   MI.eraseFromParent();
3631   return true;
3632 }
3633 
3634 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3635                                               MachineRegisterInfo &MRI,
3636                                               MachineIRBuilder &B,
3637                                               unsigned AddrSpace) const {
3638   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3639   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3640   Register Hi32 = Unmerge.getReg(1);
3641 
3642   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3643   MI.eraseFromParent();
3644   return true;
3645 }
3646 
3647 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3648 // offset (the offset that is included in bounds checking and swizzling, to be
3649 // split between the instruction's voffset and immoffset fields) and soffset
3650 // (the offset that is excluded from bounds checking and swizzling, to go in
3651 // the instruction's soffset field).  This function takes the first kind of
3652 // offset and figures out how to split it between voffset and immoffset.
3653 std::tuple<Register, unsigned, unsigned>
3654 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3655                                         Register OrigOffset) const {
3656   const unsigned MaxImm = 4095;
3657   Register BaseReg;
3658   unsigned TotalConstOffset;
3659   const LLT S32 = LLT::scalar(32);
3660   MachineRegisterInfo &MRI = *B.getMRI();
3661 
3662   std::tie(BaseReg, TotalConstOffset) =
3663       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3664 
3665   unsigned ImmOffset = TotalConstOffset;
3666 
3667   // If BaseReg is a pointer, convert it to int.
3668   if (MRI.getType(BaseReg).isPointer())
3669     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3670 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands a better chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if the rounded
  // value is negative, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
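  // For example, a constant offset of 8204 becomes ImmOffset = 12, with 8192
  // going into the copy/add that feeds the voffset register.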
3678   unsigned Overflow = ImmOffset & ~MaxImm;
3679   ImmOffset -= Overflow;
3680   if ((int32_t)Overflow < 0) {
3681     Overflow += ImmOffset;
3682     ImmOffset = 0;
3683   }
3684 
3685   if (Overflow != 0) {
3686     if (!BaseReg) {
3687       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3688     } else {
3689       auto OverflowVal = B.buildConstant(S32, Overflow);
3690       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3691     }
3692   }
3693 
3694   if (!BaseReg)
3695     BaseReg = B.buildConstant(S32, 0).getReg(0);
3696 
3697   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3698 }
3699 
3700 /// Handle register layout difference for f16 images for some subtargets.
3701 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3702                                              MachineRegisterInfo &MRI,
3703                                              Register Reg,
3704                                              bool ImageStore) const {
3705   const LLT S16 = LLT::scalar(16);
3706   const LLT S32 = LLT::scalar(32);
3707   LLT StoreVT = MRI.getType(Reg);
3708   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3709 
3710   if (ST.hasUnpackedD16VMem()) {
3711     auto Unmerge = B.buildUnmerge(S16, Reg);
3712 
3713     SmallVector<Register, 4> WideRegs;
3714     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3715       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3716 
3717     int NumElts = StoreVT.getNumElements();
3718 
3719     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3720         .getReg(0);
3721   }
3722 
3723   if (ImageStore && ST.hasImageStoreD16Bug()) {
3724     if (StoreVT.getNumElements() == 2) {
3725       SmallVector<Register, 4> PackedRegs;
3726       Reg = B.buildBitcast(S32, Reg).getReg(0);
3727       PackedRegs.push_back(Reg);
3728       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3729       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3730           .getReg(0);
3731     }
3732 
3733     if (StoreVT.getNumElements() == 3) {
3734       SmallVector<Register, 4> PackedRegs;
3735       auto Unmerge = B.buildUnmerge(S16, Reg);
3736       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3737         PackedRegs.push_back(Unmerge.getReg(I));
3738       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3739       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3740       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3741     }
3742 
3743     if (StoreVT.getNumElements() == 4) {
3744       SmallVector<Register, 4> PackedRegs;
3745       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3746       auto Unmerge = B.buildUnmerge(S32, Reg);
3747       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3748         PackedRegs.push_back(Unmerge.getReg(I));
3749       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3750       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3751           .getReg(0);
3752     }
3753 
3754     llvm_unreachable("invalid data type");
3755   }
3756 
3757   return Reg;
3758 }
3759 
3760 Register AMDGPULegalizerInfo::fixStoreSourceType(
3761   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3762   MachineRegisterInfo *MRI = B.getMRI();
3763   LLT Ty = MRI->getType(VData);
3764 
3765   const LLT S16 = LLT::scalar(16);
3766 
  // Fix up illegal register types for 8-bit and 16-bit stores.
3768   if (Ty == LLT::scalar(8) || Ty == S16) {
3769     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3770     return AnyExt;
3771   }
3772 
3773   if (Ty.isVector()) {
3774     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3775       if (IsFormat)
3776         return handleD16VData(B, *MRI, VData);
3777     }
3778   }
3779 
3780   return VData;
3781 }
3782 
3783 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3784                                               MachineRegisterInfo &MRI,
3785                                               MachineIRBuilder &B,
3786                                               bool IsTyped,
3787                                               bool IsFormat) const {
3788   Register VData = MI.getOperand(1).getReg();
3789   LLT Ty = MRI.getType(VData);
3790   LLT EltTy = Ty.getScalarType();
3791   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3792   const LLT S32 = LLT::scalar(32);
3793 
3794   VData = fixStoreSourceType(B, VData, IsFormat);
3795   Register RSrc = MI.getOperand(2).getReg();
3796 
3797   MachineMemOperand *MMO = *MI.memoperands_begin();
3798   const int MemSize = MMO->getSize();
3799 
3800   unsigned ImmOffset;
3801   unsigned TotalOffset;
3802 
3803   // The typed intrinsics add an immediate after the registers.
3804   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3805 
3806   // The struct intrinsic variants add one additional operand over raw.
3807   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3808   Register VIndex;
3809   int OpOffset = 0;
3810   if (HasVIndex) {
3811     VIndex = MI.getOperand(3).getReg();
3812     OpOffset = 1;
3813   }
3814 
3815   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3816   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3817 
3818   unsigned Format = 0;
3819   if (IsTyped) {
3820     Format = MI.getOperand(5 + OpOffset).getImm();
3821     ++OpOffset;
3822   }
3823 
3824   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3825 
3826   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3827   if (TotalOffset != 0)
3828     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3829 
3830   unsigned Opc;
3831   if (IsTyped) {
3832     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3833                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3834   } else if (IsFormat) {
3835     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3836                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3837   } else {
3838     switch (MemSize) {
3839     case 1:
3840       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3841       break;
3842     case 2:
3843       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3844       break;
3845     default:
3846       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3847       break;
3848     }
3849   }
3850 
3851   if (!VIndex)
3852     VIndex = B.buildConstant(S32, 0).getReg(0);
3853 
3854   auto MIB = B.buildInstr(Opc)
3855     .addUse(VData)              // vdata
3856     .addUse(RSrc)               // rsrc
3857     .addUse(VIndex)             // vindex
3858     .addUse(VOffset)            // voffset
3859     .addUse(SOffset)            // soffset
3860     .addImm(ImmOffset);         // offset(imm)
3861 
3862   if (IsTyped)
3863     MIB.addImm(Format);
3864 
3865   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3866      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3867      .addMemOperand(MMO);
3868 
3869   MI.eraseFromParent();
3870   return true;
3871 }
3872 
3873 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3874                                              MachineRegisterInfo &MRI,
3875                                              MachineIRBuilder &B,
3876                                              bool IsFormat,
3877                                              bool IsTyped) const {
3878   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3879   MachineMemOperand *MMO = *MI.memoperands_begin();
3880   const LLT MemTy = MMO->getMemoryType();
3881   const LLT S32 = LLT::scalar(32);
3882 
3883   Register Dst = MI.getOperand(0).getReg();
3884   Register RSrc = MI.getOperand(2).getReg();
3885 
3886   // The typed intrinsics add an immediate after the registers.
3887   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3888 
3889   // The struct intrinsic variants add one additional operand over raw.
3890   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3891   Register VIndex;
3892   int OpOffset = 0;
3893   if (HasVIndex) {
3894     VIndex = MI.getOperand(3).getReg();
3895     OpOffset = 1;
3896   }
3897 
3898   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3899   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3900 
3901   unsigned Format = 0;
3902   if (IsTyped) {
3903     Format = MI.getOperand(5 + OpOffset).getImm();
3904     ++OpOffset;
3905   }
3906 
3907   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3908   unsigned ImmOffset;
3909   unsigned TotalOffset;
3910 
3911   LLT Ty = MRI.getType(Dst);
3912   LLT EltTy = Ty.getScalarType();
3913   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3914   const bool Unpacked = ST.hasUnpackedD16VMem();
3915 
3916   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3917   if (TotalOffset != 0)
3918     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemTy);
3919 
3920   unsigned Opc;
3921 
3922   if (IsTyped) {
3923     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3924                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3925   } else if (IsFormat) {
3926     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3927                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3928   } else {
3929     switch (MemTy.getSizeInBits()) {
3930     case 8:
3931       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3932       break;
3933     case 16:
3934       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3935       break;
3936     default:
3937       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3938       break;
3939     }
3940   }
3941 
3942   Register LoadDstReg;
3943 
3944   bool IsExtLoad =
3945       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
3946   LLT UnpackedTy = Ty.changeElementSize(32);
3947 
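  // Sub-dword scalar results are loaded into a wider s32 register and truncated
  // afterwards; unpacked d16 vector results are loaded as a vector of s32
  // elements and repacked to 16-bit elements below.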
3948   if (IsExtLoad)
3949     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3950   else if (Unpacked && IsD16 && Ty.isVector())
3951     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3952   else
3953     LoadDstReg = Dst;
3954 
3955   if (!VIndex)
3956     VIndex = B.buildConstant(S32, 0).getReg(0);
3957 
3958   auto MIB = B.buildInstr(Opc)
3959     .addDef(LoadDstReg)         // vdata
3960     .addUse(RSrc)               // rsrc
3961     .addUse(VIndex)             // vindex
3962     .addUse(VOffset)            // voffset
3963     .addUse(SOffset)            // soffset
3964     .addImm(ImmOffset);         // offset(imm)
3965 
3966   if (IsTyped)
3967     MIB.addImm(Format);
3968 
3969   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3970      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3971      .addMemOperand(MMO);
3972 
3973   if (LoadDstReg != Dst) {
3974     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3975 
    // The result was widened for the extending load; truncate back to the
    // original type.
3977     if (IsExtLoad)
3978       B.buildTrunc(Dst, LoadDstReg);
3979     else {
3980       // Repack to original 16-bit vector result
3981       // FIXME: G_TRUNC should work, but legalization currently fails
3982       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3983       SmallVector<Register, 4> Repack;
3984       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3985         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3986       B.buildMerge(Dst, Repack);
3987     }
3988   }
3989 
3990   MI.eraseFromParent();
3991   return true;
3992 }
3993 
3994 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3995                                                MachineIRBuilder &B,
3996                                                bool IsInc) const {
3997   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3998                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3999   B.buildInstr(Opc)
4000     .addDef(MI.getOperand(0).getReg())
4001     .addUse(MI.getOperand(2).getReg())
4002     .addUse(MI.getOperand(3).getReg())
4003     .cloneMemRefs(MI);
4004   MI.eraseFromParent();
4005   return true;
4006 }
4007 
4008 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4009   switch (IntrID) {
4010   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4011   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4012     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4013   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4014   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4015     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4016   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4017   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4018     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4019   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4020   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4021     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4022   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4023   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4024     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4025   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4026   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4027     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4028   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4029   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4030     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4031   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4032   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4033     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4034   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4035   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4036     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4037   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4038   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4039     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4040   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4041   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4042     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4043   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4044   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4045     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4046   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4047   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4048     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4049   case Intrinsic::amdgcn_buffer_atomic_fadd:
4050   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4051   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4052     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4053   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4054   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4055     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4056   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4057   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4058     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4059   default:
4060     llvm_unreachable("unhandled atomic opcode");
4061   }
4062 }
4063 
4064 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4065                                                MachineIRBuilder &B,
4066                                                Intrinsic::ID IID) const {
4067   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4068                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4069   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4070 
4071   Register Dst;
4072 
4073   int OpOffset = 0;
4074   if (HasReturn) {
4075     // A few FP atomics do not support return values.
4076     Dst = MI.getOperand(0).getReg();
4077   } else {
4078     OpOffset = -1;
4079   }
4080 
4081   Register VData = MI.getOperand(2 + OpOffset).getReg();
4082   Register CmpVal;
4083 
4084   if (IsCmpSwap) {
4085     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4086     ++OpOffset;
4087   }
4088 
4089   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4090   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4091 
4092   // The struct intrinsic variants add one additional operand over raw.
4093   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4094   Register VIndex;
4095   if (HasVIndex) {
4096     VIndex = MI.getOperand(4 + OpOffset).getReg();
4097     ++OpOffset;
4098   }
4099 
4100   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4101   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4102   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4103 
4104   MachineMemOperand *MMO = *MI.memoperands_begin();
4105 
4106   unsigned ImmOffset;
4107   unsigned TotalOffset;
4108   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4109   if (TotalOffset != 0)
4110     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4111 
4112   if (!VIndex)
4113     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4114 
4115   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4116 
4117   if (HasReturn)
4118     MIB.addDef(Dst);
4119 
4120   MIB.addUse(VData); // vdata
4121 
4122   if (IsCmpSwap)
4123     MIB.addReg(CmpVal);
4124 
4125   MIB.addUse(RSrc)               // rsrc
4126      .addUse(VIndex)             // vindex
4127      .addUse(VOffset)            // voffset
4128      .addUse(SOffset)            // soffset
4129      .addImm(ImmOffset)          // offset(imm)
4130      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4131      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4132      .addMemOperand(MMO);
4133 
4134   MI.eraseFromParent();
4135   return true;
4136 }
4137 
/// Turn the set of s16 typed address registers of \p MI into dword sized
/// <2 x s16> vectors, appending the packed values to \p PackedAddrs.
4140 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4141                                       SmallVectorImpl<Register> &PackedAddrs,
4142                                       unsigned ArgOffset,
4143                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4144                                       bool IsA16, bool IsG16) {
4145   const LLT S16 = LLT::scalar(16);
4146   const LLT V2S16 = LLT::fixed_vector(2, 16);
4147   auto EndIdx = Intr->VAddrEnd;
4148 
4149   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4150     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4151     if (!SrcOp.isReg())
4152       continue; // _L to _LZ may have eliminated this.
4153 
4154     Register AddrReg = SrcOp.getReg();
4155 
4156     if (I < Intr->GradientStart) {
4157       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4158       PackedAddrs.push_back(AddrReg);
4159     } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4160                (I >= Intr->CoordStart && !IsA16)) {
4161       // Handle any gradient or coordinate operands that should not be packed
4162       PackedAddrs.push_back(AddrReg);
4163     } else {
4164       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4165       // derivatives dx/dh and dx/dv are packed with undef.
4166       if (((I + 1) >= EndIdx) ||
4167           ((Intr->NumGradients / 2) % 2 == 1 &&
4168            (I == static_cast<unsigned>(Intr->GradientStart +
4169                                        (Intr->NumGradients / 2) - 1) ||
4170             I == static_cast<unsigned>(Intr->GradientStart +
4171                                        Intr->NumGradients - 1))) ||
4172           // Check for _L to _LZ optimization
4173           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4174         PackedAddrs.push_back(
4175             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4176                 .getReg(0));
4177       } else {
4178         PackedAddrs.push_back(
4179             B.buildBuildVector(
4180                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4181                 .getReg(0));
4182         ++I;
4183       }
4184     }
4185   }
4186 }
4187 
4188 /// Convert from separate vaddr components to a single vector address register,
4189 /// and replace the remaining operands with $noreg.
4190 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4191                                      int DimIdx, int NumVAddrs) {
4192   const LLT S32 = LLT::scalar(32);
4193 
4194   SmallVector<Register, 8> AddrRegs;
4195   for (int I = 0; I != NumVAddrs; ++I) {
4196     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4197     if (SrcOp.isReg()) {
4198       AddrRegs.push_back(SrcOp.getReg());
4199       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4200     }
4201   }
4202 
4203   int NumAddrRegs = AddrRegs.size();
4204   if (NumAddrRegs != 1) {
4205     // Round up to 8 elements for v5-v7
4206     // FIXME: Missing intermediate sized register classes and instructions.
4207     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4208       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4209       auto Undef = B.buildUndef(S32);
4210       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4211       NumAddrRegs = RoundedNumRegs;
4212     }
4213 
4214     auto VAddr =
4215         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4216     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4217   }
4218 
4219   for (int I = 1; I != NumVAddrs; ++I) {
4220     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4221     if (SrcOp.isReg())
4222       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4223   }
4224 }
4225 
4226 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4227 ///
4228 /// Depending on the subtarget, load/store with 16-bit element data need to be
4229 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4230 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4231 /// registers.
4232 ///
4233 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4238 /// now unnecessary arguments with $noreg.
4239 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4240     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4241     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4242 
4243   const unsigned NumDefs = MI.getNumExplicitDefs();
4244   const unsigned ArgOffset = NumDefs + 1;
4245   bool IsTFE = NumDefs == 2;
4246   // We are only processing the operands of d16 image operations on subtargets
4247   // that use the unpacked register layout, or need to repack the TFE result.
4248 
4249   // TODO: Do we need to guard against already legalized intrinsics?
4250   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4251       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4252 
4253   MachineRegisterInfo *MRI = B.getMRI();
4254   const LLT S32 = LLT::scalar(32);
4255   const LLT S16 = LLT::scalar(16);
4256   const LLT V2S16 = LLT::fixed_vector(2, 16);
4257 
4258   unsigned DMask = 0;
4259 
4260   // Check for 16 bit addresses and pack if true.
4261   LLT GradTy =
4262       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4263   LLT AddrTy =
4264       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4265   const bool IsG16 = GradTy == S16;
4266   const bool IsA16 = AddrTy == S16;
4267 
4268   int DMaskLanes = 0;
4269   if (!BaseOpcode->Atomic) {
4270     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4271     if (BaseOpcode->Gather4) {
4272       DMaskLanes = 4;
4273     } else if (DMask != 0) {
4274       DMaskLanes = countPopulation(DMask);
4275     } else if (!IsTFE && !BaseOpcode->Store) {
4276       // If dmask is 0, this is a no-op load. This can be eliminated.
4277       B.buildUndef(MI.getOperand(0));
4278       MI.eraseFromParent();
4279       return true;
4280     }
4281   }
4282 
4283   Observer.changingInstr(MI);
4284   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4285 
4286   unsigned NewOpcode = NumDefs == 0 ?
4287     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4288 
4289   // Track that we legalized this
4290   MI.setDesc(B.getTII().get(NewOpcode));
4291 
4292   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4293   // dmask to be at least 1, otherwise the instruction will fail.
4294   if (IsTFE && DMask == 0) {
4295     DMask = 0x1;
4296     DMaskLanes = 1;
4297     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4298   }
4299 
4300   if (BaseOpcode->Atomic) {
4301     Register VData0 = MI.getOperand(2).getReg();
4302     LLT Ty = MRI->getType(VData0);
4303 
4304     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4305     if (Ty.isVector())
4306       return false;
4307 
4308     if (BaseOpcode->AtomicX2) {
4309       Register VData1 = MI.getOperand(3).getReg();
4310       // The two values are packed in one register.
4311       LLT PackedTy = LLT::fixed_vector(2, Ty);
4312       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4313       MI.getOperand(2).setReg(Concat.getReg(0));
4314       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4315     }
4316   }
4317 
4318   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4319 
4320   // Optimize _L to _LZ when the lod argument is a constant zero (or negative).
4321   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4322           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4323     const ConstantFP *ConstantLod;
4324 
4325     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4326                  m_GFCst(ConstantLod))) {
4327       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4328         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4329         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4330             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4331                                                       Intr->Dim);
4332 
4333         // The starting indexes should remain in the same place.
4334         --CorrectedNumVAddrs;
4335 
4336         MI.getOperand(MI.getNumExplicitDefs())
4337             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4338         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4339         Intr = NewImageDimIntr;
4340       }
4341     }
4342   }
4343 
4344   // Optimize _mip away when 'lod' is zero.
4345   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4346     int64_t ConstantLod;
4347     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4348                  m_ICst(ConstantLod))) {
4349       if (ConstantLod == 0) {
4350         // TODO: Change the intrinsic opcode and remove the operand instead of
4351         // replacing it with 0, as is done for the _L to _LZ case above.
4352         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4353         --CorrectedNumVAddrs;
4354       }
4355     }
4356   }
4357 
4358   // Rewrite the addressing register layout before doing anything else.
4359   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4360     // 16-bit gradients are supported, but they are tied to the A16 control,
4361     // so both gradients and addresses must be 16-bit.
4362     return false;
4363   }
4364 
4365   if (IsA16 && !ST.hasA16()) {
4366     // A16 not supported
4367     return false;
4368   }
4369 
4370   if (IsA16 || IsG16) {
4371     if (Intr->NumVAddrs > 1) {
4372       SmallVector<Register, 4> PackedRegs;
4373 
4374       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4375                                 IsG16);
4376 
4377       // See also below in the non-a16 branch
4378       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4379 
4380       if (!UseNSA && PackedRegs.size() > 1) {
4381         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4382         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4383         PackedRegs[0] = Concat.getReg(0);
4384         PackedRegs.resize(1);
4385       }
4386 
4387       const unsigned NumPacked = PackedRegs.size();
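      // Rewrite the vaddr operands to use the packed registers, and clear any
      // operands that are no longer needed.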
4388       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4389         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4390         if (!SrcOp.isReg()) {
4391           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4392           continue;
4393         }
4394 
4395         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4396 
4397         if (I - Intr->VAddrStart < NumPacked)
4398           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4399         else
4400           SrcOp.setReg(AMDGPU::NoRegister);
4401       }
4402     }
4403   } else {
4404     // If the register allocator cannot place the address registers contiguously
4405     // without introducing moves, then using the non-sequential address encoding
4406     // is always preferable, since it saves VALU instructions and is usually a
4407     // wash in terms of code size or even better.
4408     //
4409     // However, we currently have no way of hinting to the register allocator
4410     // that MIMG addresses should be placed contiguously when it is possible to
4411     // do so, so force non-NSA for the common 2-address case as a heuristic.
4412     //
4413     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4414     // allocation when possible.
4415     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4416 
4417     if (!UseNSA && Intr->NumVAddrs > 1)
4418       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4419                                Intr->NumVAddrs);
4420   }
4421 
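  // Encode whether a16/g16 were used as an immediate flag operand
  // (bit 0 = a16, bit 1 = g16).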
4422   int Flags = 0;
4423   if (IsA16)
4424     Flags |= 1;
4425   if (IsG16)
4426     Flags |= 2;
4427   MI.addOperand(MachineOperand::CreateImm(Flags));
4428 
4429   if (BaseOpcode->Store) { // No TFE for stores?
4430     // TODO: Handle dmask trim
4431     Register VData = MI.getOperand(1).getReg();
4432     LLT Ty = MRI->getType(VData);
4433     if (!Ty.isVector() || Ty.getElementType() != S16)
4434       return true;
4435 
4436     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4437     if (RepackedReg != VData) {
4438       MI.getOperand(1).setReg(RepackedReg);
4439     }
4440 
4441     return true;
4442   }
4443 
4444   Register DstReg = MI.getOperand(0).getReg();
4445   LLT Ty = MRI->getType(DstReg);
4446   const LLT EltTy = Ty.getScalarType();
4447   const bool IsD16 = Ty.getScalarType() == S16;
4448   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4449 
4450   // Confirm that the return type is large enough for the dmask specified
4451   if (NumElts < DMaskLanes)
4452     return false;
4453 
4454   if (NumElts > 4 || DMaskLanes > 4)
4455     return false;
4456 
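  // Only the dmask-enabled components are actually loaded; a dmask of 0 still
  // produces a single element.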
4457   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4458   const LLT AdjustedTy =
4459       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4460 
4461   // The raw dword-aligned data component of the load. The only legal cases
4462   // where this matters should be when using the packed D16 format, for
4463   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4464   LLT RoundedTy;
4465 
4466   // S32 vector to cover all data, plus the TFE result element.
4467   LLT TFETy;
4468 
4469   // Register type to use for each loaded component. Will be S32 or V2S16.
4470   LLT RegTy;
4471 
4472   if (IsD16 && ST.hasUnpackedD16VMem()) {
4473     RoundedTy =
4474         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4475     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4476     RegTy = S32;
4477   } else {
4478     unsigned EltSize = EltTy.getSizeInBits();
4479     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4480     unsigned RoundedSize = 32 * RoundedElts;
4481     RoundedTy = LLT::scalarOrVector(
4482         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4483     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4484     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4485   }
4486 
4487   // The return type does not need adjustment.
4488   // TODO: Should we change s16 case to s32 or <2 x s16>?
4489   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4490     return true;
4491 
4492   Register Dst1Reg;
4493 
4494   // Insert after the instruction.
4495   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4496 
4497   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4498   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4499   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4500   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4501 
4502   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4503 
4504   MI.getOperand(0).setReg(NewResultReg);
4505 
4506   // In the IR, TFE is supposed to be used with a 2 element struct return
4507   // type. The instruction really returns these two values in one contiguous
4508   // register, with one additional dword beyond the loaded data. Rewrite the
4509   // return type to use a single register result.
4510 
4511   if (IsTFE) {
4512     Dst1Reg = MI.getOperand(1).getReg();
4513     if (MRI->getType(Dst1Reg) != S32)
4514       return false;
4515 
4516     // TODO: Make sure the TFE operand bit is set.
4517     MI.RemoveOperand(1);
4518 
4519     // Handle the easy case that requires no repack instructions.
4520     if (Ty == S32) {
4521       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4522       return true;
4523     }
4524   }
4525 
4526   // Now figure out how to copy the new result register back into the old
4527   // result.
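  // For TFE, seeding the slots with Dst1Reg lets the unmerge write the error
  // flag directly to its destination; the data slots are overwritten below.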
4528   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4529 
4530   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4531 
4532   if (ResultNumRegs == 1) {
4533     assert(!IsTFE);
4534     ResultRegs[0] = NewResultReg;
4535   } else {
4536     // We have to repack into a new vector of some kind.
4537     for (int I = 0; I != NumDataRegs; ++I)
4538       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4539     B.buildUnmerge(ResultRegs, NewResultReg);
4540 
4541     // Drop the final TFE element to get the data part. The TFE result is
4542     // directly written to the right place already.
4543     if (IsTFE)
4544       ResultRegs.resize(NumDataRegs);
4545   }
4546 
4547   // For an s16 scalar result, we form an s32 result with a truncate regardless
4548   // of packed vs. unpacked.
4549   if (IsD16 && !Ty.isVector()) {
4550     B.buildTrunc(DstReg, ResultRegs[0]);
4551     return true;
4552   }
4553 
4554   // Avoid a build/concat_vector of 1 entry.
4555   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4556     B.buildBitcast(DstReg, ResultRegs[0]);
4557     return true;
4558   }
4559 
4560   assert(Ty.isVector());
4561 
4562   if (IsD16) {
4563     // For packed D16 results with TFE enabled, all the data components are
4564     // S32. Cast back to the expected type.
4565     //
4566     // TODO: We don't really need to load s32 elements. We would only need one
4567     // cast for the TFE result if a multiple of v2s16 was used.
4568     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4569       for (Register &Reg : ResultRegs)
4570         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4571     } else if (ST.hasUnpackedD16VMem()) {
4572       for (Register &Reg : ResultRegs)
4573         Reg = B.buildTrunc(S16, Reg).getReg(0);
4574     }
4575   }
4576 
4577   auto padWithUndef = [&](LLT Ty, int NumElts) {
4578     if (NumElts == 0)
4579       return;
4580     Register Undef = B.buildUndef(Ty).getReg(0);
4581     for (int I = 0; I != NumElts; ++I)
4582       ResultRegs.push_back(Undef);
4583   };
4584 
4585   // Pad out any elements eliminated due to the dmask.
4586   LLT ResTy = MRI->getType(ResultRegs[0]);
4587   if (!ResTy.isVector()) {
4588     padWithUndef(ResTy, NumElts - ResultRegs.size());
4589     B.buildBuildVector(DstReg, ResultRegs);
4590     return true;
4591   }
4592 
4593   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4594   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4595 
4596   // Deal with the one annoying legal case.
4597   const LLT V3S16 = LLT::fixed_vector(3, 16);
4598   if (Ty == V3S16) {
4599     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4600     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4601     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4602     return true;
4603   }
4604 
4605   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4606   B.buildConcatVectors(DstReg, ResultRegs);
4607   return true;
4608 }
4609 
4610 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4611   LegalizerHelper &Helper, MachineInstr &MI) const {
4612   MachineIRBuilder &B = Helper.MIRBuilder;
4613   GISelChangeObserver &Observer = Helper.Observer;
4614 
4615   Register Dst = MI.getOperand(0).getReg();
4616   LLT Ty = B.getMRI()->getType(Dst);
4617   unsigned Size = Ty.getSizeInBits();
4618   MachineFunction &MF = B.getMF();
4619 
4620   Observer.changingInstr(MI);
4621 
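  // If this result type should be handled as a bitcast to an equivalently
  // sized type, rewrite the destination first.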
4622   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4623     Ty = getBitcastRegisterType(Ty);
4624     Helper.bitcastDst(MI, Ty, 0);
4625     Dst = MI.getOperand(0).getReg();
4626     B.setInsertPt(B.getMBB(), MI);
4627   }
4628 
4629   // FIXME: We don't really need this intermediate instruction. The intrinsic
4630   // should be fixed to have a memory operand. Since it's readnone, we're not
4631   // allowed to add one.
4632   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4633   MI.RemoveOperand(1); // Remove intrinsic ID
4634 
4635   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4636   // TODO: Should this use datalayout alignment?
4637   const unsigned MemSize = (Size + 7) / 8;
4638   const Align MemAlign(4);
4639   MachineMemOperand *MMO = MF.getMachineMemOperand(
4640       MachinePointerInfo(),
4641       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4642           MachineMemOperand::MOInvariant,
4643       MemSize, MemAlign);
4644   MI.addMemOperand(MF, MMO);
4645 
4646   // There are no 96-bit result scalar loads, but widening to 128-bit should
4647   // always be legal. We may need to restore this to a 96-bit result if it turns
4648   // out this needs to be converted to a vector load during RegBankSelect.
4649   if (!isPowerOf2_32(Size)) {
4650     if (Ty.isVector())
4651       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4652     else
4653       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4654   }
4655 
4656   Observer.changedInstr(MI);
4657   return true;
4658 }
4659 
4660 // TODO: Move to selection
4661 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4662                                                 MachineRegisterInfo &MRI,
4663                                                 MachineIRBuilder &B) const {
4664   if (!ST.isTrapHandlerEnabled() ||
4665       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4666     return legalizeTrapEndpgm(MI, MRI, B);
4667 
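  // Only code object V4 on subtargets that can provide the doorbell ID may
  // omit passing the queue pointer to the trap handler.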
4668   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4669     switch (*HsaAbiVer) {
4670     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4671     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4672       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4673     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4674       return ST.supportsGetDoorbellID() ?
4675           legalizeTrapHsa(MI, MRI, B) :
4676           legalizeTrapHsaQueuePtr(MI, MRI, B);
4677     }
4678   }
4679 
4680   llvm_unreachable("Unknown trap handler");
4681 }
4682 
4683 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4684     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4685   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4686   MI.eraseFromParent();
4687   return true;
4688 }
4689 
4690 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4691     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4692   // Pass queue pointer to trap handler as input, and insert trap instruction
4693   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4694   Register LiveIn =
4695     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4696   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4697     return false;
4698 
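  // The trap handler expects the queue pointer in SGPR0_SGPR1.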
4699   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4700   B.buildCopy(SGPR01, LiveIn);
4701   B.buildInstr(AMDGPU::S_TRAP)
4702       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4703       .addReg(SGPR01, RegState::Implicit);
4704 
4705   MI.eraseFromParent();
4706   return true;
4707 }
4708 
4709 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4710     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4711   B.buildInstr(AMDGPU::S_TRAP)
4712       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4713   MI.eraseFromParent();
4714   return true;
4715 }
4716 
4717 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4718     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4719   // If this is a non-HSA path or the trap handler is disabled, report a
4720   // warning accordingly.
4721   if (!ST.isTrapHandlerEnabled() ||
4722       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4723     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4724                                      "debugtrap handler not supported",
4725                                      MI.getDebugLoc(), DS_Warning);
4726     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4727     Ctx.diagnose(NoTrap);
4728   } else {
4729     // Insert debug-trap instruction
4730     B.buildInstr(AMDGPU::S_TRAP)
4731         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4732   }
4733 
4734   MI.eraseFromParent();
4735   return true;
4736 }
4737 
4738 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4739                                                MachineIRBuilder &B) const {
4740   MachineRegisterInfo &MRI = *B.getMRI();
4741   const LLT S16 = LLT::scalar(16);
4742   const LLT S32 = LLT::scalar(32);
4743 
4744   Register DstReg = MI.getOperand(0).getReg();
4745   Register NodePtr = MI.getOperand(2).getReg();
4746   Register RayExtent = MI.getOperand(3).getReg();
4747   Register RayOrigin = MI.getOperand(4).getReg();
4748   Register RayDir = MI.getOperand(5).getReg();
4749   Register RayInvDir = MI.getOperand(6).getReg();
4750   Register TDescr = MI.getOperand(7).getReg();
4751 
4752   if (!ST.hasGFX10_AEncoding()) {
4753     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4754                                         "intrinsic not supported on subtarget",
4755                                         MI.getDebugLoc());
4756     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4757     return false;
4758   }
4759 
4760   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4761   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4762   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4763                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4764                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4765                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4766 
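  // Flatten the operands into the list of dwords the NSA encoding expects; a
  // 64-bit node pointer occupies two of them.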
4767   SmallVector<Register, 12> Ops;
4768   if (Is64) {
4769     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4770     Ops.push_back(Unmerge.getReg(0));
4771     Ops.push_back(Unmerge.getReg(1));
4772   } else {
4773     Ops.push_back(NodePtr);
4774   }
4775   Ops.push_back(RayExtent);
4776 
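  // Push the first three 32-bit lanes of a <4 x s32> operand.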
4777   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4778     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4779     Ops.push_back(Unmerge.getReg(0));
4780     Ops.push_back(Unmerge.getReg(1));
4781     Ops.push_back(Unmerge.getReg(2));
4782   };
4783 
4784   packLanes(RayOrigin);
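  // With 16-bit components, the direction and inverse direction are repacked
  // into three dwords: (dir.xy), (dir.z, invdir.x), (invdir.yz).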
4785   if (IsA16) {
4786     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4787     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4788     Register R1 = MRI.createGenericVirtualRegister(S32);
4789     Register R2 = MRI.createGenericVirtualRegister(S32);
4790     Register R3 = MRI.createGenericVirtualRegister(S32);
4791     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4792     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4793     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4794     Ops.push_back(R1);
4795     Ops.push_back(R2);
4796     Ops.push_back(R3);
4797   } else {
4798     packLanes(RayDir);
4799     packLanes(RayInvDir);
4800   }
4801 
4802   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4803     .addDef(DstReg)
4804     .addImm(Opcode);
4805 
4806   for (Register R : Ops) {
4807     MIB.addUse(R);
4808   }
4809 
4810   MIB.addUse(TDescr)
4811      .addImm(IsA16 ? 1 : 0)
4812      .cloneMemRefs(MI);
4813 
4814   MI.eraseFromParent();
4815   return true;
4816 }
4817 
4818 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4819                                             MachineInstr &MI) const {
4820   MachineIRBuilder &B = Helper.MIRBuilder;
4821   MachineRegisterInfo &MRI = *B.getMRI();
4822 
4823   // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4824   auto IntrID = MI.getIntrinsicID();
4825   switch (IntrID) {
4826   case Intrinsic::amdgcn_if:
4827   case Intrinsic::amdgcn_else: {
4828     MachineInstr *Br = nullptr;
4829     MachineBasicBlock *UncondBrTarget = nullptr;
4830     bool Negated = false;
4831     if (MachineInstr *BrCond =
4832             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4833       const SIRegisterInfo *TRI
4834         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4835 
4836       Register Def = MI.getOperand(1).getReg();
4837       Register Use = MI.getOperand(3).getReg();
4838 
4839       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4840 
4841       if (Negated)
4842         std::swap(CondBrTarget, UncondBrTarget);
4843 
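      // Rebuild the branch as the SI_IF/SI_ELSE pseudo at the position of the
      // original G_BRCOND.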
4844       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4845       if (IntrID == Intrinsic::amdgcn_if) {
4846         B.buildInstr(AMDGPU::SI_IF)
4847           .addDef(Def)
4848           .addUse(Use)
4849           .addMBB(UncondBrTarget);
4850       } else {
4851         B.buildInstr(AMDGPU::SI_ELSE)
4852             .addDef(Def)
4853             .addUse(Use)
4854             .addMBB(UncondBrTarget);
4855       }
4856 
4857       if (Br) {
4858         Br->getOperand(0).setMBB(CondBrTarget);
4859       } else {
4860         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4861         // since we're swapping branch targets it needs to be reinserted.
4862         // FIXME: IRTranslator should probably not do this
4863         B.buildBr(*CondBrTarget);
4864       }
4865 
4866       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4867       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4868       MI.eraseFromParent();
4869       BrCond->eraseFromParent();
4870       return true;
4871     }
4872 
4873     return false;
4874   }
4875   case Intrinsic::amdgcn_loop: {
4876     MachineInstr *Br = nullptr;
4877     MachineBasicBlock *UncondBrTarget = nullptr;
4878     bool Negated = false;
4879     if (MachineInstr *BrCond =
4880             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4881       const SIRegisterInfo *TRI
4882         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4883 
4884       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4885       Register Reg = MI.getOperand(2).getReg();
4886 
4887       if (Negated)
4888         std::swap(CondBrTarget, UncondBrTarget);
4889 
4890       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4891       B.buildInstr(AMDGPU::SI_LOOP)
4892         .addUse(Reg)
4893         .addMBB(UncondBrTarget);
4894 
4895       if (Br)
4896         Br->getOperand(0).setMBB(CondBrTarget);
4897       else
4898         B.buildBr(*CondBrTarget);
4899 
4900       MI.eraseFromParent();
4901       BrCond->eraseFromParent();
4902       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4903       return true;
4904     }
4905 
4906     return false;
4907   }
4908   case Intrinsic::amdgcn_kernarg_segment_ptr:
4909     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4910       // This only makes sense to call in a kernel, so just lower to null.
4911       B.buildConstant(MI.getOperand(0).getReg(), 0);
4912       MI.eraseFromParent();
4913       return true;
4914     }
4915 
4916     return legalizePreloadedArgIntrin(
4917       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4918   case Intrinsic::amdgcn_implicitarg_ptr:
4919     return legalizeImplicitArgPtr(MI, MRI, B);
4920   case Intrinsic::amdgcn_workitem_id_x:
4921     return legalizePreloadedArgIntrin(MI, MRI, B,
4922                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4923   case Intrinsic::amdgcn_workitem_id_y:
4924     return legalizePreloadedArgIntrin(MI, MRI, B,
4925                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4926   case Intrinsic::amdgcn_workitem_id_z:
4927     return legalizePreloadedArgIntrin(MI, MRI, B,
4928                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4929   case Intrinsic::amdgcn_workgroup_id_x:
4930     return legalizePreloadedArgIntrin(MI, MRI, B,
4931                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4932   case Intrinsic::amdgcn_workgroup_id_y:
4933     return legalizePreloadedArgIntrin(MI, MRI, B,
4934                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4935   case Intrinsic::amdgcn_workgroup_id_z:
4936     return legalizePreloadedArgIntrin(MI, MRI, B,
4937                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4938   case Intrinsic::amdgcn_dispatch_ptr:
4939     return legalizePreloadedArgIntrin(MI, MRI, B,
4940                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4941   case Intrinsic::amdgcn_queue_ptr:
4942     return legalizePreloadedArgIntrin(MI, MRI, B,
4943                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4944   case Intrinsic::amdgcn_implicit_buffer_ptr:
4945     return legalizePreloadedArgIntrin(
4946       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4947   case Intrinsic::amdgcn_dispatch_id:
4948     return legalizePreloadedArgIntrin(MI, MRI, B,
4949                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4950   case Intrinsic::amdgcn_fdiv_fast:
4951     return legalizeFDIVFastIntrin(MI, MRI, B);
4952   case Intrinsic::amdgcn_is_shared:
4953     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4954   case Intrinsic::amdgcn_is_private:
4955     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4956   case Intrinsic::amdgcn_wavefrontsize: {
4957     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4958     MI.eraseFromParent();
4959     return true;
4960   }
4961   case Intrinsic::amdgcn_s_buffer_load:
4962     return legalizeSBufferLoad(Helper, MI);
4963   case Intrinsic::amdgcn_raw_buffer_store:
4964   case Intrinsic::amdgcn_struct_buffer_store:
4965     return legalizeBufferStore(MI, MRI, B, false, false);
4966   case Intrinsic::amdgcn_raw_buffer_store_format:
4967   case Intrinsic::amdgcn_struct_buffer_store_format:
4968     return legalizeBufferStore(MI, MRI, B, false, true);
4969   case Intrinsic::amdgcn_raw_tbuffer_store:
4970   case Intrinsic::amdgcn_struct_tbuffer_store:
4971     return legalizeBufferStore(MI, MRI, B, true, true);
4972   case Intrinsic::amdgcn_raw_buffer_load:
4973   case Intrinsic::amdgcn_struct_buffer_load:
4974     return legalizeBufferLoad(MI, MRI, B, false, false);
4975   case Intrinsic::amdgcn_raw_buffer_load_format:
4976   case Intrinsic::amdgcn_struct_buffer_load_format:
4977     return legalizeBufferLoad(MI, MRI, B, true, false);
4978   case Intrinsic::amdgcn_raw_tbuffer_load:
4979   case Intrinsic::amdgcn_struct_tbuffer_load:
4980     return legalizeBufferLoad(MI, MRI, B, true, true);
4981   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4982   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4983   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4984   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4985   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4986   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4987   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4988   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4989   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4990   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4991   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4992   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4993   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4994   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4995   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4996   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4997   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4998   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4999   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5000   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5001   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5002   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5003   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5004   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5005   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5006   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5007   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5008   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5009   case Intrinsic::amdgcn_buffer_atomic_fadd:
5010   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5011   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5012   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5013   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5014     return legalizeBufferAtomic(MI, B, IntrID);
5015   case Intrinsic::amdgcn_atomic_inc:
5016     return legalizeAtomicIncDec(MI, B, true);
5017   case Intrinsic::amdgcn_atomic_dec:
5018     return legalizeAtomicIncDec(MI, B, false);
5019   case Intrinsic::trap:
5020     return legalizeTrapIntrinsic(MI, MRI, B);
5021   case Intrinsic::debugtrap:
5022     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5023   case Intrinsic::amdgcn_rsq_clamp:
5024     return legalizeRsqClampIntrinsic(MI, MRI, B);
5025   case Intrinsic::amdgcn_ds_fadd:
5026   case Intrinsic::amdgcn_ds_fmin:
5027   case Intrinsic::amdgcn_ds_fmax:
5028     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5029   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5030     return legalizeBVHIntrinsic(MI, B);
5031   default: {
5032     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5033             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5034       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5035     return true;
5036   }
5037   }
5038 
5039   return true;
5040 }
5041