1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round up the number of vector elements to the next power of two.
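// e.g. <3 x s16> -> <4 x s16>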
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55 
// Round up the scalar size in bits to the next power of two.
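// e.g. s48 -> s64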
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx,
100                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
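// Split the vector into pieces of at most 64 bits each by reducing the
// element count; e.g. <4 x s32> -> <2 x s32>.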
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::make_pair(
112         TypeIdx,
113         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114   };
115 }
116 
// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120   return [=](const LegalityQuery &Query) {
121     const LLT Ty = Query.Types[TypeIdx];
122 
123     const LLT EltTy = Ty.getElementType();
124     const int Size = Ty.getSizeInBits();
125     const int EltSize = EltTy.getSizeInBits();
126     const int NextMul32 = (Size + 31) / 32;
127 
128     assert(EltSize < 32);
129 
130     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132   };
133 }
134 
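// Pick the type to bitcast to so the value maps cleanly onto registers: a
// single scalar for sizes of 32 bits or less, otherwise a scalar or vector of
// 32-bit elements.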
135 static LLT getBitcastRegisterType(const LLT Ty) {
136   const unsigned Size = Ty.getSizeInBits();
137 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
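// Maximum size (in bits) of a single load or store to keep together for this
// address space; wider accesses get split during legalization.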
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
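// Check whether the combination of register type, memory type, address space
// and alignment in this load/store query is something the hardware can do
// directly (without splitting or widening).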
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB})
534       .legalFor({S32, S16, V2S16})
535       .clampMaxNumElementsStrict(0, S16, 2)
536       .scalarize(0)
537       .minScalar(0, S16)
538       .widenScalarToNextMultipleOf(0, 32)
539       .maxScalar(0, S32);
540 
541     getActionDefinitionsBuilder(G_MUL)
542       .legalFor({S32, S16, V2S16})
543       .clampMaxNumElementsStrict(0, S16, 2)
544       .scalarize(0)
545       .minScalar(0, S16)
546       .widenScalarToNextMultipleOf(0, 32)
547       .custom();
548     assert(ST.hasMad64_32());
549 
550     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551       .legalFor({S32, S16, V2S16}) // Clamp modifier
552       .minScalarOrElt(0, S16)
553       .clampMaxNumElementsStrict(0, S16, 2)
554       .scalarize(0)
555       .widenScalarToNextPow2(0, 32)
556       .lower();
557   } else if (ST.has16BitInsts()) {
558     getActionDefinitionsBuilder({G_ADD, G_SUB})
559       .legalFor({S32, S16})
560       .minScalar(0, S16)
561       .widenScalarToNextMultipleOf(0, 32)
562       .maxScalar(0, S32)
563       .scalarize(0);
564 
565     getActionDefinitionsBuilder(G_MUL)
566       .legalFor({S32, S16})
567       .scalarize(0)
568       .minScalar(0, S16)
569       .widenScalarToNextMultipleOf(0, 32)
570       .custom();
571     assert(ST.hasMad64_32());
572 
573     // Technically the saturating operations require clamp bit support, but this
574     // was introduced at the same time as 16-bit operations.
575     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576       .legalFor({S32, S16}) // Clamp modifier
577       .minScalar(0, S16)
578       .scalarize(0)
579       .widenScalarToNextPow2(0, 16)
580       .lower();
581 
582     // We're just lowering this, but it helps get a better result to try to
583     // coerce to the desired type first.
584     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585       .minScalar(0, S16)
586       .scalarize(0)
587       .lower();
588   } else {
589     getActionDefinitionsBuilder({G_ADD, G_SUB})
590       .legalFor({S32})
591       .widenScalarToNextMultipleOf(0, 32)
592       .clampScalar(0, S32, S32)
593       .scalarize(0);
594 
595     auto &Mul = getActionDefinitionsBuilder(G_MUL)
596       .legalFor({S32})
597       .scalarize(0)
598       .minScalar(0, S32)
599       .widenScalarToNextMultipleOf(0, 32);
600 
601     if (ST.hasMad64_32())
602       Mul.custom();
603     else
604       Mul.maxScalar(0, S32);
605 
606     if (ST.hasIntClamp()) {
607       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608         .legalFor({S32}) // Clamp modifier.
609         .scalarize(0)
610         .minScalarOrElt(0, S32)
611         .lower();
612     } else {
613       // Clamp bit support was added in VI, along with 16-bit operations.
614       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615         .minScalar(0, S32)
616         .scalarize(0)
617         .lower();
618     }
619 
620     // FIXME: DAG expansion gets better results. The widening uses the smaller
621     // range values and goes for the min/max lowering directly.
622     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623       .minScalar(0, S32)
624       .scalarize(0)
625       .lower();
626   }
627 
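  // No instructions for integer division; these all go through custom
  // expansion.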
628   getActionDefinitionsBuilder(
629       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
630       .customFor({S32, S64})
631       .clampScalar(0, S32, S64)
632       .widenScalarToNextPow2(0, 32)
633       .scalarize(0);
634 
635   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
636                    .legalFor({S32})
637                    .maxScalar(0, S32);
638 
639   if (ST.hasVOP3PInsts()) {
640     Mulh
641       .clampMaxNumElements(0, S8, 2)
642       .lowerFor({V2S8});
643   }
644 
645   Mulh
646     .scalarize(0)
647     .lower();
648 
649   // Report legal for any types we can handle anywhere. For the cases only legal
650   // on the SALU, RegBankSelect will be able to re-legalize.
651   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
652     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
653     .clampScalar(0, S32, S64)
654     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
655     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
656     .widenScalarToNextPow2(0)
657     .scalarize(0);
658 
659   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
660                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661     .legalFor({{S32, S1}, {S32, S32}})
662     .minScalar(0, S32)
663     .scalarize(0)
664     .lower();
665 
666   getActionDefinitionsBuilder(G_BITCAST)
667     // Don't worry about the size constraint.
668     .legalIf(all(isRegisterType(0), isRegisterType(1)))
669     .lower();
670 
671 
672   getActionDefinitionsBuilder(G_CONSTANT)
673     .legalFor({S1, S32, S64, S16, GlobalPtr,
674                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675     .legalIf(isPointer(0))
676     .clampScalar(0, S32, S64)
677     .widenScalarToNextPow2(0);
678 
679   getActionDefinitionsBuilder(G_FCONSTANT)
680     .legalFor({S32, S64, S16})
681     .clampScalar(0, S16, S64);
682 
683   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
684       .legalIf(isRegisterType(0))
685       // s1 and s16 are special cases because they have legal operations on
686       // them, but don't really occupy registers in the normal way.
687       .legalFor({S1, S16})
688       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689       .clampScalarOrElt(0, S32, MaxScalar)
690       .widenScalarToNextPow2(0, 32)
691       .clampMaxNumElements(0, S32, 16);
692 
693   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
694 
695   // If the amount is divergent, we have to do a wave reduction to get the
696   // maximum value, so this is expanded during RegBankSelect.
697   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
698     .legalFor({{PrivatePtr, S32}});
699 
700   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701     .customIf(typeIsNot(0, PrivatePtr));
702 
703   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
704 
705   auto &FPOpActions = getActionDefinitionsBuilder(
706     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
707     .legalFor({S32, S64});
708   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
709     .customFor({S32, S64});
710   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
711     .customFor({S32, S64});
712 
713   if (ST.has16BitInsts()) {
714     if (ST.hasVOP3PInsts())
715       FPOpActions.legalFor({S16, V2S16});
716     else
717       FPOpActions.legalFor({S16});
718 
719     TrigActions.customFor({S16});
720     FDIVActions.customFor({S16});
721   }
722 
723   auto &MinNumMaxNum = getActionDefinitionsBuilder({
724       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
725 
726   if (ST.hasVOP3PInsts()) {
727     MinNumMaxNum.customFor(FPTypesPK16)
728       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
729       .clampMaxNumElements(0, S16, 2)
730       .clampScalar(0, S16, S64)
731       .scalarize(0);
732   } else if (ST.has16BitInsts()) {
733     MinNumMaxNum.customFor(FPTypes16)
734       .clampScalar(0, S16, S64)
735       .scalarize(0);
736   } else {
737     MinNumMaxNum.customFor(FPTypesBase)
738       .clampScalar(0, S32, S64)
739       .scalarize(0);
740   }
741 
742   if (ST.hasVOP3PInsts())
743     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
744 
745   FPOpActions
746     .scalarize(0)
747     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
748 
749   TrigActions
750     .scalarize(0)
751     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
752 
753   FDIVActions
754     .scalarize(0)
755     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
756 
757   getActionDefinitionsBuilder({G_FNEG, G_FABS})
758     .legalFor(FPTypesPK16)
759     .clampMaxNumElementsStrict(0, S16, 2)
760     .scalarize(0)
761     .clampScalar(0, S16, S64);
762 
763   if (ST.has16BitInsts()) {
764     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
765       .legalFor({S32, S64, S16})
766       .scalarize(0)
767       .clampScalar(0, S16, S64);
768   } else {
769     getActionDefinitionsBuilder(G_FSQRT)
770       .legalFor({S32, S64})
771       .scalarize(0)
772       .clampScalar(0, S32, S64);
773 
774     if (ST.hasFractBug()) {
775       getActionDefinitionsBuilder(G_FFLOOR)
776         .customFor({S64})
777         .legalFor({S32, S64})
778         .scalarize(0)
779         .clampScalar(0, S32, S64);
780     } else {
781       getActionDefinitionsBuilder(G_FFLOOR)
782         .legalFor({S32, S64})
783         .scalarize(0)
784         .clampScalar(0, S32, S64);
785     }
786   }
787 
788   getActionDefinitionsBuilder(G_FPTRUNC)
789     .legalFor({{S32, S64}, {S16, S32}})
790     .scalarize(0)
791     .lower();
792 
793   getActionDefinitionsBuilder(G_FPEXT)
794     .legalFor({{S64, S32}, {S32, S16}})
795     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
796     .scalarize(0);
797 
798   getActionDefinitionsBuilder(G_FSUB)
799       // Use actual fsub instruction
800       .legalFor({S32})
801       // Must use fadd + fneg
802       .lowerFor({S64, S16, V2S16})
803       .scalarize(0)
804       .clampScalar(0, S32, S64);
805 
806   // Whether this is legal depends on the floating point mode for the function.
807   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
808   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
809     FMad.customFor({S32, S16});
810   else if (ST.hasMadMacF32Insts())
811     FMad.customFor({S32});
812   else if (ST.hasMadF16())
813     FMad.customFor({S16});
814   FMad.scalarize(0)
815       .lower();
816 
817   auto &FRem = getActionDefinitionsBuilder(G_FREM);
818   if (ST.has16BitInsts()) {
819     FRem.customFor({S16, S32, S64});
820   } else {
821     FRem.minScalar(0, S32)
822         .customFor({S32, S64});
823   }
824   FRem.scalarize(0);
825 
826   // TODO: Do we need to clamp maximum bitwidth?
827   getActionDefinitionsBuilder(G_TRUNC)
828     .legalIf(isScalar(0))
829     .legalFor({{V2S16, V2S32}})
830     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinitely
    // loop in the legalizer.
834     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
835     .alwaysLegal();
836 
837   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
838     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
839                {S32, S1}, {S64, S1}, {S16, S1}})
840     .scalarize(0)
841     .clampScalar(0, S32, S64)
842     .widenScalarToNextPow2(1, 32);
843 
844   // TODO: Split s1->s64 during regbankselect for VALU.
845   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
846                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
847                     .lowerIf(typeIs(1, S1))
848                     .customFor({{S32, S64}, {S64, S64}});
849   if (ST.has16BitInsts())
850     IToFP.legalFor({{S16, S16}});
851   IToFP.clampScalar(1, S32, S64)
852        .minScalar(0, S32)
853        .scalarize(0)
854        .widenScalarToNextPow2(1);
855 
856   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
857     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
858     .customFor({{S64, S32}, {S64, S64}})
859     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
860   if (ST.has16BitInsts())
861     FPToI.legalFor({{S16, S16}});
862   else
863     FPToI.minScalar(1, S32);
864 
865   FPToI.minScalar(0, S32)
866        .widenScalarToNextPow2(0, 32)
867        .scalarize(0)
868        .lower();
869 
870   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
871       .customFor({S16, S32})
872       .scalarize(0)
873       .lower();
874 
875   // Lower roundeven into G_FRINT
876   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
877     .scalarize(0)
878     .lower();
879 
880   if (ST.has16BitInsts()) {
881     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
882       .legalFor({S16, S32, S64})
883       .clampScalar(0, S16, S64)
884       .scalarize(0);
885   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
886     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
887       .legalFor({S32, S64})
888       .clampScalar(0, S32, S64)
889       .scalarize(0);
890   } else {
891     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
892       .legalFor({S32})
893       .customFor({S64})
894       .clampScalar(0, S32, S64)
895       .scalarize(0);
896   }
897 
898   getActionDefinitionsBuilder(G_PTR_ADD)
899     .legalIf(all(isPointer(0), sameSize(0, 1)))
900     .scalarize(0)
901     .scalarSameSizeAs(1, 0);
902 
903   getActionDefinitionsBuilder(G_PTRMASK)
904     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
905     .scalarSameSizeAs(1, 0)
906     .scalarize(0);
907 
908   auto &CmpBuilder =
909     getActionDefinitionsBuilder(G_ICMP)
910     // The compare output type differs based on the register bank of the output,
911     // so make both s1 and s32 legal.
912     //
913     // Scalar compares producing output in scc will be promoted to s32, as that
914     // is the allocatable register type that will be needed for the copy from
915     // scc. This will be promoted during RegBankSelect, and we assume something
916     // before that won't try to use s32 result types.
917     //
918     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
919     // bank.
920     .legalForCartesianProduct(
921       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
922     .legalForCartesianProduct(
923       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
924   if (ST.has16BitInsts()) {
925     CmpBuilder.legalFor({{S1, S16}});
926   }
927 
928   CmpBuilder
929     .widenScalarToNextPow2(1)
930     .clampScalar(1, S32, S64)
931     .scalarize(0)
932     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
933 
934   getActionDefinitionsBuilder(G_FCMP)
935     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
936     .widenScalarToNextPow2(1)
937     .clampScalar(1, S32, S64)
938     .scalarize(0);
939 
940   // FIXME: fpow has a selection pattern that should move to custom lowering.
941   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
942   if (ST.has16BitInsts())
943     Exp2Ops.legalFor({S32, S16});
944   else
945     Exp2Ops.legalFor({S32});
946   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
947   Exp2Ops.scalarize(0);
948 
949   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
950   if (ST.has16BitInsts())
951     ExpOps.customFor({{S32}, {S16}});
952   else
953     ExpOps.customFor({S32});
954   ExpOps.clampScalar(0, MinScalarFPTy, S32)
955         .scalarize(0);
956 
957   getActionDefinitionsBuilder(G_FPOWI)
958     .clampScalar(0, MinScalarFPTy, S32)
959     .lower();
960 
961   // The 64-bit versions produce 32-bit results, but only on the SALU.
962   getActionDefinitionsBuilder(G_CTPOP)
963     .legalFor({{S32, S32}, {S32, S64}})
964     .clampScalar(0, S32, S32)
965     .widenScalarToNextPow2(1, 32)
966     .clampScalar(1, S32, S64)
967     .scalarize(0)
968     .widenScalarToNextPow2(0, 32);
969 
970 
971   // The hardware instructions return a different result on 0 than the generic
972   // instructions expect. The hardware produces -1, but these produce the
973   // bitwidth.
974   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
975     .scalarize(0)
976     .clampScalar(0, S32, S32)
977     .clampScalar(1, S32, S64)
978     .widenScalarToNextPow2(0, 32)
979     .widenScalarToNextPow2(1, 32)
980     .custom();
981 
982   // The 64-bit versions produce 32-bit results, but only on the SALU.
983   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
984     .legalFor({{S32, S32}, {S32, S64}})
985     .clampScalar(0, S32, S32)
986     .clampScalar(1, S32, S64)
987     .scalarize(0)
988     .widenScalarToNextPow2(0, 32)
989     .widenScalarToNextPow2(1, 32);
990 
991   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
992   // RegBankSelect.
993   getActionDefinitionsBuilder(G_BITREVERSE)
994     .legalFor({S32, S64})
995     .clampScalar(0, S32, S64)
996     .scalarize(0)
997     .widenScalarToNextPow2(0);
998 
999   if (ST.has16BitInsts()) {
1000     getActionDefinitionsBuilder(G_BSWAP)
1001       .legalFor({S16, S32, V2S16})
1002       .clampMaxNumElementsStrict(0, S16, 2)
1003       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1004       // narrowScalar limitation.
1005       .widenScalarToNextPow2(0)
1006       .clampScalar(0, S16, S32)
1007       .scalarize(0);
1008 
1009     if (ST.hasVOP3PInsts()) {
1010       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1011         .legalFor({S32, S16, V2S16})
1012         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1013         .clampMaxNumElements(0, S16, 2)
1014         .minScalar(0, S16)
1015         .widenScalarToNextPow2(0)
1016         .scalarize(0)
1017         .lower();
1018     } else {
1019       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1020         .legalFor({S32, S16})
1021         .widenScalarToNextPow2(0)
1022         .minScalar(0, S16)
1023         .scalarize(0)
1024         .lower();
1025     }
1026   } else {
1027     // TODO: Should have same legality without v_perm_b32
1028     getActionDefinitionsBuilder(G_BSWAP)
1029       .legalFor({S32})
1030       .lowerIf(scalarNarrowerThan(0, 32))
1031       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1032       // narrowScalar limitation.
1033       .widenScalarToNextPow2(0)
1034       .maxScalar(0, S32)
1035       .scalarize(0)
1036       .lower();
1037 
1038     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1039       .legalFor({S32})
1040       .minScalar(0, S32)
1041       .widenScalarToNextPow2(0)
1042       .scalarize(0)
1043       .lower();
1044   }
1045 
1046   getActionDefinitionsBuilder(G_INTTOPTR)
1047     // List the common cases
1048     .legalForCartesianProduct(AddrSpaces64, {S64})
1049     .legalForCartesianProduct(AddrSpaces32, {S32})
1050     .scalarize(0)
1051     // Accept any address space as long as the size matches
1052     .legalIf(sameSize(0, 1))
1053     .widenScalarIf(smallerThan(1, 0),
1054       [](const LegalityQuery &Query) {
1055         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1056       })
1057     .narrowScalarIf(largerThan(1, 0),
1058       [](const LegalityQuery &Query) {
1059         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1060       });
1061 
1062   getActionDefinitionsBuilder(G_PTRTOINT)
1063     // List the common cases
1064     .legalForCartesianProduct(AddrSpaces64, {S64})
1065     .legalForCartesianProduct(AddrSpaces32, {S32})
1066     .scalarize(0)
1067     // Accept any address space as long as the size matches
1068     .legalIf(sameSize(0, 1))
1069     .widenScalarIf(smallerThan(0, 1),
1070       [](const LegalityQuery &Query) {
1071         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1072       })
1073     .narrowScalarIf(
1074       largerThan(0, 1),
1075       [](const LegalityQuery &Query) {
1076         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1077       });
1078 
1079   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1080     .scalarize(0)
1081     .custom();
1082 
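  // Whether a load or store has to be split because the memory access is too
  // wide for the address space, or would map to an unsupported number of
  // 32-bit registers.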
1083   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1084                                     bool IsLoad) -> bool {
1085     const LLT DstTy = Query.Types[0];
1086 
1087     // Split vector extloads.
1088     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1089 
1090     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1091       return true;
1092 
1093     const LLT PtrTy = Query.Types[1];
1094     unsigned AS = PtrTy.getAddressSpace();
1095     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1096       return true;
1097 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
1099     // TODO: May be able to widen depending on alignment etc.
1100     unsigned NumRegs = (MemSize + 31) / 32;
1101     if (NumRegs == 3) {
1102       if (!ST.hasDwordx3LoadStores())
1103         return true;
1104     } else {
1105       // If the alignment allows, these should have been widened.
1106       if (!isPowerOf2_32(NumRegs))
1107         return true;
1108     }
1109 
1110     return false;
1111   };
1112 
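  // Minimum alignment (in bits) required for the explicitly listed global and
  // constant address space cases below; 0 when the subtarget supports
  // unaligned buffer accesses.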
1113   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1114   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1115   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1116 
1117   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1118   // LDS
1119   // TODO: Unsupported flat for SI.
1120 
1121   for (unsigned Op : {G_LOAD, G_STORE}) {
1122     const bool IsStore = Op == G_STORE;
1123 
1124     auto &Actions = getActionDefinitionsBuilder(Op);
1125     // Explicitly list some common cases.
1126     // TODO: Does this help compile time at all?
1127     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1128                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1129                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1130                                       {S64, GlobalPtr, S64, GlobalAlign32},
1131                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1132                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1133                                       {S32, GlobalPtr, S8, GlobalAlign8},
1134                                       {S32, GlobalPtr, S16, GlobalAlign16},
1135 
1136                                       {S32, LocalPtr, S32, 32},
1137                                       {S64, LocalPtr, S64, 32},
1138                                       {V2S32, LocalPtr, V2S32, 32},
1139                                       {S32, LocalPtr, S8, 8},
1140                                       {S32, LocalPtr, S16, 16},
1141                                       {V2S16, LocalPtr, S32, 32},
1142 
1143                                       {S32, PrivatePtr, S32, 32},
1144                                       {S32, PrivatePtr, S8, 8},
1145                                       {S32, PrivatePtr, S16, 16},
1146                                       {V2S16, PrivatePtr, S32, 32},
1147 
1148                                       {S32, ConstantPtr, S32, GlobalAlign32},
1149                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1150                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1151                                       {S64, ConstantPtr, S64, GlobalAlign32},
1152                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1153     Actions.legalIf(
1154       [=](const LegalityQuery &Query) -> bool {
1155         return isLoadStoreLegal(ST, Query);
1156       });
1157 
1158     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1159     // 64-bits.
1160     //
1161     // TODO: Should generalize bitcast action into coerce, which will also cover
1162     // inserting addrspacecasts.
1163     Actions.customIf(typeIs(1, Constant32Ptr));
1164 
1165     // Turn any illegal element vectors into something easier to deal
1166     // with. These will ultimately produce 32-bit scalar shifts to extract the
1167     // parts anyway.
1168     //
1169     // For odd 16-bit element vectors, prefer to split those into pieces with
1170     // 16-bit vector parts.
1171     Actions.bitcastIf(
1172       [=](const LegalityQuery &Query) -> bool {
1173         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1174                                           Query.MMODescrs[0].MemoryTy);
1175       }, bitcastToRegisterType(0));
1176 
1177     if (!IsStore) {
1178       // Widen suitably aligned loads by loading extra bytes. The standard
1179       // legalization actions can't properly express widening memory operands.
1180       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1181         return shouldWidenLoad(ST, Query, G_LOAD);
1182       });
1183     }
1184 
1185     // FIXME: load/store narrowing should be moved to lower action
1186     Actions
1187         .narrowScalarIf(
1188             [=](const LegalityQuery &Query) -> bool {
1189               return !Query.Types[0].isVector() &&
1190                      needToSplitMemOp(Query, Op == G_LOAD);
1191             },
1192             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1193               const LLT DstTy = Query.Types[0];
1194               const LLT PtrTy = Query.Types[1];
1195 
1196               const unsigned DstSize = DstTy.getSizeInBits();
1197               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198 
1199               // Split extloads.
1200               if (DstSize > MemSize)
1201                 return std::make_pair(0, LLT::scalar(MemSize));
1202 
1203               unsigned MaxSize = maxSizeForAddrSpace(ST,
1204                                                      PtrTy.getAddressSpace(),
1205                                                      Op == G_LOAD);
1206               if (MemSize > MaxSize)
1207                 return std::make_pair(0, LLT::scalar(MaxSize));
1208 
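              // The remaining split reasons are unsupported register counts
              // (e.g. 3 dwords without dwordx3 support); break the access down
              // to the known alignment.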
1209               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1210               return std::make_pair(0, LLT::scalar(Align));
1211             })
1212         .fewerElementsIf(
1213             [=](const LegalityQuery &Query) -> bool {
1214               return Query.Types[0].isVector() &&
1215                      needToSplitMemOp(Query, Op == G_LOAD);
1216             },
1217             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1218               const LLT DstTy = Query.Types[0];
1219               const LLT PtrTy = Query.Types[1];
1220 
1221               LLT EltTy = DstTy.getElementType();
1222               unsigned MaxSize = maxSizeForAddrSpace(ST,
1223                                                      PtrTy.getAddressSpace(),
1224                                                      Op == G_LOAD);
1225 
1226               // FIXME: Handle widened to power of 2 results better. This ends
1227               // up scalarizing.
1228               // FIXME: 3 element stores scalarized on SI
1229 
1230               // Split if it's too large for the address space.
1231               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1232               if (MemSize > MaxSize) {
1233                 unsigned NumElts = DstTy.getNumElements();
1234                 unsigned EltSize = EltTy.getSizeInBits();
1235 
1236                 if (MaxSize % EltSize == 0) {
1237                   return std::make_pair(
1238                       0, LLT::scalarOrVector(
1239                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1240                 }
1241 
1242                 unsigned NumPieces = MemSize / MaxSize;
1243 
1244                 // FIXME: Refine when odd breakdowns handled
1245                 // The scalars will need to be re-legalized.
1246                 if (NumPieces == 1 || NumPieces >= NumElts ||
1247                     NumElts % NumPieces != 0)
1248                   return std::make_pair(0, EltTy);
1249 
1250                 return std::make_pair(
1251                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1252               }
1253 
1254               // FIXME: We could probably handle weird extending loads better.
1255               if (DstTy.getSizeInBits() > MemSize)
1256                 return std::make_pair(0, EltTy);
1257 
1258               unsigned EltSize = EltTy.getSizeInBits();
1259               unsigned DstSize = DstTy.getSizeInBits();
1260               if (!isPowerOf2_32(DstSize)) {
1261                 // We're probably decomposing an odd sized store. Try to split
1262                 // to the widest type. TODO: Account for alignment. As-is it
1263                 // should be OK, since the new parts will be further legalized.
1264                 unsigned FloorSize = PowerOf2Floor(DstSize);
1265                 return std::make_pair(
1266                     0, LLT::scalarOrVector(
1267                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1268               }
1269 
1270               // May need relegalization for the scalars.
1271               return std::make_pair(0, EltTy);
1272             })
1273     .minScalar(0, S32)
1274     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1275     .widenScalarToNextPow2(0)
1276     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1277     .lower();
1278   }
1279 
1280   // FIXME: Unaligned accesses not lowered.
1281   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1282                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1283                                                   {S32, GlobalPtr, S16, 2 * 8},
1284                                                   {S32, LocalPtr, S8, 8},
1285                                                   {S32, LocalPtr, S16, 16},
1286                                                   {S32, PrivatePtr, S8, 8},
1287                                                   {S32, PrivatePtr, S16, 16},
1288                                                   {S32, ConstantPtr, S8, 8},
1289                                                   {S32, ConstantPtr, S16, 2 * 8}})
1290                        .legalIf(
1291                          [=](const LegalityQuery &Query) -> bool {
1292                            return isLoadStoreLegal(ST, Query);
1293                          });
1294 
1295   if (ST.hasFlatAddressSpace()) {
1296     ExtLoads.legalForTypesWithMemDesc(
1297         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1298   }
1299 
1300   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1301   // 64-bits.
1302   //
1303   // TODO: Should generalize bitcast action into coerce, which will also cover
1304   // inserting addrspacecasts.
1305   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1306 
1307   ExtLoads.clampScalar(0, S32, S32)
1308           .widenScalarToNextPow2(0)
1309           .lower();
1310 
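  // 32- and 64-bit integer atomic RMW operations on global, LDS and region
  // memory; flat variants are added below when the subtarget has a flat
  // address space.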
1311   auto &Atomics = getActionDefinitionsBuilder(
1312     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1313      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1314      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1315      G_ATOMICRMW_UMIN})
1316     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1317                {S64, GlobalPtr}, {S64, LocalPtr},
1318                {S32, RegionPtr}, {S64, RegionPtr}});
1319   if (ST.hasFlatAddressSpace()) {
1320     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1321   }
1322 
1323   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1324   if (ST.hasLDSFPAtomicAdd()) {
1325     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1326     if (ST.hasGFX90AInsts())
1327       Atomic.legalFor({{S64, LocalPtr}});
1328     if (ST.hasGFX940Insts())
1329       Atomic.legalFor({{V2S16, LocalPtr}});
1330   }
1331   if (ST.hasAtomicFaddInsts())
1332     Atomic.legalFor({{S32, GlobalPtr}});
1333 
1334   if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations.
1337     // TODO: Move atomic expansion into legalizer
1338     // TODO: Also supports <2 x f16>
1339     Atomic.legalFor({
1340         {S32, GlobalPtr},
1341         {S64, GlobalPtr},
1342         {S64, FlatPtr}
1343       });
1344   }
1345 
1346   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1347   // demarshalling
1348   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1349     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1350                 {S32, FlatPtr}, {S64, FlatPtr}})
1351     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1352                {S32, RegionPtr}, {S64, RegionPtr}});
1353   // TODO: Pointer types, any 32-bit or 64-bit vector
1354 
1355   // Condition should be s32 for scalar, s1 for vector.
1356   getActionDefinitionsBuilder(G_SELECT)
1357       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1358                                  LocalPtr, FlatPtr, PrivatePtr,
1359                                  LLT::fixed_vector(2, LocalPtr),
1360                                  LLT::fixed_vector(2, PrivatePtr)},
1361                                 {S1, S32})
1362       .clampScalar(0, S16, S64)
1363       .scalarize(1)
1364       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1365       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1366       .clampMaxNumElements(0, S32, 2)
1367       .clampMaxNumElements(0, LocalPtr, 2)
1368       .clampMaxNumElements(0, PrivatePtr, 2)
1369       .scalarize(0)
1370       .widenScalarToNextPow2(0)
1371       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1372 
1373   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1374   // be more flexible with the shift amount type.
1375   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1376     .legalFor({{S32, S32}, {S64, S32}});
1377   if (ST.has16BitInsts()) {
1378     if (ST.hasVOP3PInsts()) {
1379       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1380             .clampMaxNumElements(0, S16, 2);
1381     } else
1382       Shifts.legalFor({{S16, S16}});
1383 
1384     // TODO: Support 16-bit shift amounts for all types
1385     Shifts.widenScalarIf(
1386       [=](const LegalityQuery &Query) {
1387         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1388         // 32-bit amount.
1389         const LLT ValTy = Query.Types[0];
1390         const LLT AmountTy = Query.Types[1];
1391         return ValTy.getSizeInBits() <= 16 &&
1392                AmountTy.getSizeInBits() < 16;
1393       }, changeTo(1, S16));
1394     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1395     Shifts.clampScalar(1, S32, S32);
1396     Shifts.widenScalarToNextPow2(0, 16);
1397     Shifts.clampScalar(0, S16, S64);
1398 
1399     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1400       .minScalar(0, S16)
1401       .scalarize(0)
1402       .lower();
1403   } else {
1404     // Make sure we legalize the shift amount type first, as the general
1405     // expansion for the shifted type will produce much worse code if it hasn't
1406     // been truncated already.
1407     Shifts.clampScalar(1, S32, S32);
1408     Shifts.widenScalarToNextPow2(0, 32);
1409     Shifts.clampScalar(0, S32, S64);
1410 
1411     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1412       .minScalar(0, S32)
1413       .scalarize(0)
1414       .lower();
1415   }
1416   Shifts.scalarize(0);
1417 
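  // Vector element insert/extract with 32- or 64-bit elements is custom
  // lowered; other element sizes are first bitcast to a vector with 32- or
  // 64-bit elements.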
1418   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1419     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1420     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1421     unsigned IdxTypeIdx = 2;
1422 
1423     getActionDefinitionsBuilder(Op)
1424       .customIf([=](const LegalityQuery &Query) {
1425           const LLT EltTy = Query.Types[EltTypeIdx];
1426           const LLT VecTy = Query.Types[VecTypeIdx];
1427           const LLT IdxTy = Query.Types[IdxTypeIdx];
1428           const unsigned EltSize = EltTy.getSizeInBits();
1429           return (EltSize == 32 || EltSize == 64) &&
1430                   VecTy.getSizeInBits() % 32 == 0 &&
1431                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1432                   IdxTy.getSizeInBits() == 32;
1433         })
1434       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1435                  bitcastToVectorElement32(VecTypeIdx))
1436       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1437       .bitcastIf(
1438         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1439         [=](const LegalityQuery &Query) {
1440           // For > 64-bit element types, try to turn this into a 64-bit
1441           // element vector since we may be able to do better indexing
1442           // if this is scalar. If not, fall back to 32.
1443           const LLT EltTy = Query.Types[EltTypeIdx];
1444           const LLT VecTy = Query.Types[VecTypeIdx];
1445           const unsigned DstEltSize = EltTy.getSizeInBits();
1446           const unsigned VecSize = VecTy.getSizeInBits();
1447 
1448           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1449           return std::make_pair(
1450               VecTypeIdx,
1451               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1452         })
1453       .clampScalar(EltTypeIdx, S32, S64)
1454       .clampScalar(VecTypeIdx, S32, S64)
1455       .clampScalar(IdxTypeIdx, S32, S32)
1456       .clampMaxNumElements(VecTypeIdx, S32, 32)
1457       // TODO: Clamp elements for 64-bit vectors?
1458       // It should only be necessary with variable indexes.
1459       // As a last resort, lower to the stack
1460       .lower();
1461   }
1462 
1463   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1464     .unsupportedIf([=](const LegalityQuery &Query) {
1465         const LLT &EltTy = Query.Types[1].getElementType();
1466         return Query.Types[0] != EltTy;
1467       });
1468 
1469   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1470     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1471     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1472 
1473     // FIXME: Doesn't handle extract of illegal sizes.
1474     getActionDefinitionsBuilder(Op)
1475       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1476       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1478           // TODO: verify immediate offset here since lower only works with
1479           // whole elements.
1480           const LLT BigTy = Query.Types[BigTyIdx];
1481           return BigTy.isVector();
1482         })
1483       // FIXME: Multiples of 16 should not be legal.
1484       .legalIf([=](const LegalityQuery &Query) {
1485           const LLT BigTy = Query.Types[BigTyIdx];
1486           const LLT LitTy = Query.Types[LitTyIdx];
1487           return (BigTy.getSizeInBits() % 32 == 0) &&
1488                  (LitTy.getSizeInBits() % 16 == 0);
1489         })
1490       .widenScalarIf(
1491         [=](const LegalityQuery &Query) {
1492           const LLT BigTy = Query.Types[BigTyIdx];
1493           return (BigTy.getScalarSizeInBits() < 16);
1494         },
1495         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1496       .widenScalarIf(
1497         [=](const LegalityQuery &Query) {
1498           const LLT LitTy = Query.Types[LitTyIdx];
1499           return (LitTy.getScalarSizeInBits() < 16);
1500         },
1501         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1502       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1503       .widenScalarToNextPow2(BigTyIdx, 32);
1504 
1505   }
1506 
1507   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1508     .legalForCartesianProduct(AllS32Vectors, {S32})
1509     .legalForCartesianProduct(AllS64Vectors, {S64})
1510     .clampNumElements(0, V16S32, V32S32)
1511     .clampNumElements(0, V2S64, V16S64)
1512     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1513 
1514   if (ST.hasScalarPackInsts()) {
1515     BuildVector
1516       // FIXME: Should probably widen s1 vectors straight to s32
1517       .minScalarOrElt(0, S16)
1518       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1519       .minScalar(1, S32);
1520 
1521     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1522       .legalFor({V2S16, S32})
1523       .lower();
1524     BuildVector.minScalarOrElt(0, S32);
1525   } else {
1526     BuildVector.customFor({V2S16, S16});
1527     BuildVector.minScalarOrElt(0, S32);
1528 
1529     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1530       .customFor({V2S16, S32})
1531       .lower();
1532   }
1533 
1534   BuildVector.legalIf(isRegisterType(0));
1535 
1536   // FIXME: Clamp maximum size
1537   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1538     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1539     .clampMaxNumElements(0, S32, 32)
1540     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1541     .clampMaxNumElements(0, S16, 64);
1542 
1543   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1544   // pre-legalize.
1545   if (ST.hasVOP3PInsts()) {
1546     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1547       .customFor({V2S16, V2S16})
1548       .lower();
1549   } else
1550     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1551 
1552   // Merge/Unmerge
1553   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1554     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1555     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1556 
1557     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1558       const LLT Ty = Query.Types[TypeIdx];
1559       if (Ty.isVector()) {
1560         const LLT &EltTy = Ty.getElementType();
1561         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1562           return true;
1563         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1564           return true;
1565       }
1566       return false;
1567     };
1568 
1569     auto &Builder = getActionDefinitionsBuilder(Op)
1570       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1571       .lowerFor({{S16, V2S16}})
1572       .lowerIf([=](const LegalityQuery &Query) {
1573           const LLT BigTy = Query.Types[BigTyIdx];
1574           return BigTy.getSizeInBits() == 32;
1575         })
1576       // Try to widen to s16 first for small types.
1577       // TODO: Only do this on targets with legal s16 shifts
1578       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1579       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1580       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1581       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1582                            elementTypeIs(1, S16)),
1583                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1587       .clampScalar(LitTyIdx, S32, S512)
1588       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1589       // Break up vectors with weird elements into scalars
1590       .fewerElementsIf(
1591         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1592         scalarize(0))
1593       .fewerElementsIf(
1594         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1595         scalarize(1))
1596       .clampScalar(BigTyIdx, S32, MaxScalar);
1597 
1598     if (Op == G_MERGE_VALUES) {
1599       Builder.widenScalarIf(
1600         // TODO: Use 16-bit shifts if legal for 8-bit values?
1601         [=](const LegalityQuery &Query) {
1602           const LLT Ty = Query.Types[LitTyIdx];
1603           return Ty.getSizeInBits() < 32;
1604         },
1605         changeTo(LitTyIdx, S32));
1606     }
1607 
1608     Builder.widenScalarIf(
1609       [=](const LegalityQuery &Query) {
1610         const LLT Ty = Query.Types[BigTyIdx];
1611         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1612           Ty.getSizeInBits() % 16 != 0;
1613       },
1614       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1617         const LLT &Ty = Query.Types[BigTyIdx];
1618         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1619         if (NewSizeInBits >= 256) {
1620           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1621           if (RoundedTo < NewSizeInBits)
1622             NewSizeInBits = RoundedTo;
1623         }
1624         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1625       })
1626       // Any vectors left are the wrong size. Scalarize them.
1627       .scalarize(0)
1628       .scalarize(1);
1629   }
1630 
1631   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1632   // RegBankSelect.
1633   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1634     .legalFor({{S32}, {S64}});
1635 
1636   if (ST.hasVOP3PInsts()) {
1637     SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce wider 16-bit vectors to v2s16 before lowering, since
      // the lowering expands to shifts and this keeps them as packed vector
      // shifts.
1641       .clampMaxNumElementsStrict(0, S16, 2);
1642   } else if (ST.has16BitInsts()) {
1643     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1644   } else {
1645     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1647     SextInReg.lowerFor({{S32}, {S64}});
1648   }
1649 
1650   SextInReg
1651     .scalarize(0)
1652     .clampScalar(0, S32, S64)
1653     .lower();
1654 
1655   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1656     .scalarize(0)
1657     .lower();
1658 
  // TODO: Only try to form v2s16 with legal packed instructions.
1660   getActionDefinitionsBuilder(G_FSHR)
1661     .legalFor({{S32, S32}})
1662     .lowerFor({{V2S16, V2S16}})
1663     .clampMaxNumElementsStrict(0, S16, 2)
1664     .scalarize(0)
1665     .lower();
1666 
1667   if (ST.hasVOP3PInsts()) {
1668     getActionDefinitionsBuilder(G_FSHL)
1669       .lowerFor({{V2S16, V2S16}})
1670       .clampMaxNumElementsStrict(0, S16, 2)
1671       .scalarize(0)
1672       .lower();
1673   } else {
1674     getActionDefinitionsBuilder(G_FSHL)
1675       .scalarize(0)
1676       .lower();
1677   }
1678 
1679   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1680     .legalFor({S64});
1681 
1682   getActionDefinitionsBuilder(G_FENCE)
1683     .alwaysLegal();
1684 
1685   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1686       .scalarize(0)
1687       .minScalar(0, S32)
1688       .lower();
1689 
1690   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1691       .legalFor({{S32, S32}, {S64, S32}})
1692       .clampScalar(1, S32, S32)
1693       .clampScalar(0, S32, S64)
1694       .widenScalarToNextPow2(0)
1695       .scalarize(0);
1696 
1697   getActionDefinitionsBuilder({
1698       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1699       G_FCOPYSIGN,
1700 
1701       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1702       G_ATOMICRMW_NAND,
1703       G_ATOMICRMW_FSUB,
1704       G_READ_REGISTER,
1705       G_WRITE_REGISTER,
1706 
1707       G_SADDO, G_SSUBO,
1708 
      // TODO: Implement
1710       G_FMINIMUM, G_FMAXIMUM}).lower();
1711 
1712   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1713       .lower();
1714 
1715   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1716         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1717         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1718     .unsupported();
1719 
1720   getLegacyLegalizerInfo().computeTables();
1721   verify(*ST.getInstrInfo());
1722 }
1723 
1724 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1725                                          MachineInstr &MI) const {
1726   MachineIRBuilder &B = Helper.MIRBuilder;
1727   MachineRegisterInfo &MRI = *B.getMRI();
1728 
1729   switch (MI.getOpcode()) {
1730   case TargetOpcode::G_ADDRSPACE_CAST:
1731     return legalizeAddrSpaceCast(MI, MRI, B);
1732   case TargetOpcode::G_FRINT:
1733     return legalizeFrint(MI, MRI, B);
1734   case TargetOpcode::G_FCEIL:
1735     return legalizeFceil(MI, MRI, B);
1736   case TargetOpcode::G_FREM:
1737     return legalizeFrem(MI, MRI, B);
1738   case TargetOpcode::G_INTRINSIC_TRUNC:
1739     return legalizeIntrinsicTrunc(MI, MRI, B);
1740   case TargetOpcode::G_SITOFP:
1741     return legalizeITOFP(MI, MRI, B, true);
1742   case TargetOpcode::G_UITOFP:
1743     return legalizeITOFP(MI, MRI, B, false);
1744   case TargetOpcode::G_FPTOSI:
1745     return legalizeFPTOI(MI, MRI, B, true);
1746   case TargetOpcode::G_FPTOUI:
1747     return legalizeFPTOI(MI, MRI, B, false);
1748   case TargetOpcode::G_FMINNUM:
1749   case TargetOpcode::G_FMAXNUM:
1750   case TargetOpcode::G_FMINNUM_IEEE:
1751   case TargetOpcode::G_FMAXNUM_IEEE:
1752     return legalizeMinNumMaxNum(Helper, MI);
1753   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1754     return legalizeExtractVectorElt(MI, MRI, B);
1755   case TargetOpcode::G_INSERT_VECTOR_ELT:
1756     return legalizeInsertVectorElt(MI, MRI, B);
1757   case TargetOpcode::G_SHUFFLE_VECTOR:
1758     return legalizeShuffleVector(MI, MRI, B);
1759   case TargetOpcode::G_FSIN:
1760   case TargetOpcode::G_FCOS:
1761     return legalizeSinCos(MI, MRI, B);
1762   case TargetOpcode::G_GLOBAL_VALUE:
1763     return legalizeGlobalValue(MI, MRI, B);
1764   case TargetOpcode::G_LOAD:
1765   case TargetOpcode::G_SEXTLOAD:
1766   case TargetOpcode::G_ZEXTLOAD:
1767     return legalizeLoad(Helper, MI);
1768   case TargetOpcode::G_FMAD:
1769     return legalizeFMad(MI, MRI, B);
1770   case TargetOpcode::G_FDIV:
1771     return legalizeFDIV(MI, MRI, B);
1772   case TargetOpcode::G_UDIV:
1773   case TargetOpcode::G_UREM:
1774   case TargetOpcode::G_UDIVREM:
1775     return legalizeUnsignedDIV_REM(MI, MRI, B);
1776   case TargetOpcode::G_SDIV:
1777   case TargetOpcode::G_SREM:
1778   case TargetOpcode::G_SDIVREM:
1779     return legalizeSignedDIV_REM(MI, MRI, B);
1780   case TargetOpcode::G_ATOMIC_CMPXCHG:
1781     return legalizeAtomicCmpXChg(MI, MRI, B);
1782   case TargetOpcode::G_FLOG:
1783     return legalizeFlog(MI, B, numbers::ln2f);
1784   case TargetOpcode::G_FLOG10:
1785     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1786   case TargetOpcode::G_FEXP:
1787     return legalizeFExp(MI, B);
1788   case TargetOpcode::G_FPOW:
1789     return legalizeFPow(MI, B);
1790   case TargetOpcode::G_FFLOOR:
1791     return legalizeFFloor(MI, MRI, B);
1792   case TargetOpcode::G_BUILD_VECTOR:
1793     return legalizeBuildVector(MI, MRI, B);
1794   case TargetOpcode::G_MUL:
1795     return legalizeMul(Helper, MI);
1796   case TargetOpcode::G_CTLZ:
1797   case TargetOpcode::G_CTTZ:
1798     return legalizeCTLZ_CTTZ(MI, MRI, B);
1799   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1800     return legalizeFPTruncRound(MI, B);
1801   default:
1802     return false;
1803   }
1804 
1805   llvm_unreachable("expected switch to return");
1806 }
1807 
1808 Register AMDGPULegalizerInfo::getSegmentAperture(
1809   unsigned AS,
1810   MachineRegisterInfo &MRI,
1811   MachineIRBuilder &B) const {
1812   MachineFunction &MF = B.getMF();
1813   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1814   const LLT S32 = LLT::scalar(32);
1815 
1816   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1817 
1818   if (ST.hasApertureRegs()) {
1819     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1820     // getreg.
1821     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1822         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1823         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1824     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1825         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1826         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1827     unsigned Encoding =
1828         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1829         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1830         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1831 
1832     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1833 
1834     B.buildInstr(AMDGPU::S_GETREG_B32)
1835       .addDef(GetReg)
1836       .addImm(Encoding);
1837     MRI.setType(GetReg, S32);
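    // The getreg result holds the aperture field in its low WidthM1 + 1 bits;
    // shift it up to form the upper half of the 32-bit aperture base.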
1838 
1839     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1840     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1841   }
1842 
1843   // TODO: can we be smarter about machine pointer info?
1844   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1845   Register LoadAddr = MRI.createGenericVirtualRegister(
1846     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1847   // For code object version 5, private_base and shared_base are passed through
1848   // implicit kernargs.
1849   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1850     AMDGPUTargetLowering::ImplicitParameter Param =
1851         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1852                                       : AMDGPUTargetLowering::PRIVATE_BASE;
1853     uint64_t Offset =
1854         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1855 
1856     Register KernargPtrReg = MRI.createGenericVirtualRegister(
1857         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1858 
1859     if (!loadInputValue(KernargPtrReg, B,
1860                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1861       return Register();
1862 
1863     MachineMemOperand *MMO = MF.getMachineMemOperand(
1864         PtrInfo,
1865         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1866             MachineMemOperand::MOInvariant,
1867         LLT::scalar(32), commonAlignment(Align(64), Offset));
1868 
1869     // Pointer address
1870     B.buildPtrAdd(LoadAddr, KernargPtrReg,
1871                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1872     // Load address
1873     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1874   }
1875 
1876   Register QueuePtr = MRI.createGenericVirtualRegister(
1877     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1878 
1879   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1880     return Register();
1881 
1882   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1883   // private_segment_aperture_base_hi.
1884   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1885 
1886   MachineMemOperand *MMO = MF.getMachineMemOperand(
1887       PtrInfo,
1888       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1889           MachineMemOperand::MOInvariant,
1890       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1891 
1892   B.buildPtrAdd(LoadAddr, QueuePtr,
1893                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1894   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1895 }
1896 
1897 /// Return true if the value is a known valid address, such that a null check is
1898 /// not necessary.
1899 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1900                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1901   MachineInstr *Def = MRI.getVRegDef(Val);
1902   switch (Def->getOpcode()) {
1903   case AMDGPU::G_FRAME_INDEX:
1904   case AMDGPU::G_GLOBAL_VALUE:
1905   case AMDGPU::G_BLOCK_ADDR:
1906     return true;
1907   case AMDGPU::G_CONSTANT: {
1908     const ConstantInt *CI = Def->getOperand(1).getCImm();
1909     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1910   }
1911   default:
1912     return false;
1913   }
1914 
1915   return false;
1916 }
1917 
1918 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1919   MachineInstr &MI, MachineRegisterInfo &MRI,
1920   MachineIRBuilder &B) const {
1921   MachineFunction &MF = B.getMF();
1922 
1923   const LLT S32 = LLT::scalar(32);
1924   Register Dst = MI.getOperand(0).getReg();
1925   Register Src = MI.getOperand(1).getReg();
1926 
1927   LLT DstTy = MRI.getType(Dst);
1928   LLT SrcTy = MRI.getType(Src);
1929   unsigned DestAS = DstTy.getAddressSpace();
1930   unsigned SrcAS = SrcTy.getAddressSpace();
1931 
1932   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1933   // vector element.
1934   assert(!DstTy.isVector());
1935 
1936   const AMDGPUTargetMachine &TM
1937     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1938 
1939   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1940     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1941     return true;
1942   }
1943 
1944   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1945       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1946        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1947     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1948       // Extract low 32-bits of the pointer.
1949       B.buildExtract(Dst, Src, 0);
1950       MI.eraseFromParent();
1951       return true;
1952     }
1953 
1954     unsigned NullVal = TM.getNullPointerValue(DestAS);
1955 
1956     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1957     auto FlatNull = B.buildConstant(SrcTy, 0);
1958 
1959     // Extract low 32-bits of the pointer.
1960     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1961 
1962     auto CmpRes =
1963         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1964     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1965 
1966     MI.eraseFromParent();
1967     return true;
1968   }
1969 
1970   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1971       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1972        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1973     if (!ST.hasFlatAddressSpace())
1974       return false;
1975 
1976     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1977     if (!ApertureReg.isValid())
1978       return false;
1979 
1980     // Coerce the type of the low half of the result so we can use merge_values.
1981     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1982 
1983     // TODO: Should we allow mismatched types but matching sizes in merges to
1984     // avoid the ptrtoint?
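    // Place the 32-bit segment offset in the low half and the aperture base in
    // the high half of the flat pointer.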
1985     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1986 
1987     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1988       B.buildCopy(Dst, BuildPtr);
1989       MI.eraseFromParent();
1990       return true;
1991     }
1992 
1993     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1994     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1995 
1996     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1997                               SegmentNull.getReg(0));
1998 
1999     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2000 
2001     MI.eraseFromParent();
2002     return true;
2003   }
2004 
2005   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2006       SrcTy.getSizeInBits() == 64) {
2007     // Truncate.
2008     B.buildExtract(Dst, Src, 0);
2009     MI.eraseFromParent();
2010     return true;
2011   }
2012 
2013   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2014       DstTy.getSizeInBits() == 64) {
2015     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2016     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2017 
    // FIXME: This is a bit ugly due to merging two 32-bit pointers into another
    // pointer type. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
2021     auto HighAddr = B.buildConstant(
2022         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
2023     B.buildMerge(Dst, {Src, HighAddr});
2024     MI.eraseFromParent();
2025     return true;
2026   }
2027 
2028   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2029       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2030 
2031   LLVMContext &Ctx = MF.getFunction().getContext();
2032   Ctx.diagnose(InvalidAddrSpaceCast);
2033   B.buildUndef(Dst);
2034   MI.eraseFromParent();
2035   return true;
2036 }
2037 
2038 bool AMDGPULegalizerInfo::legalizeFrint(
2039   MachineInstr &MI, MachineRegisterInfo &MRI,
2040   MachineIRBuilder &B) const {
2041   Register Src = MI.getOperand(1).getReg();
2042   LLT Ty = MRI.getType(Src);
2043   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2044 
2045   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2046   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
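  // C1 is 2^52; adding and subtracting it (with the source's sign) rounds away
  // the fraction bits. C2 is just below 2^52, the magnitude above which the
  // source is already an integer and is returned unchanged.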
2047 
2048   auto C1 = B.buildFConstant(Ty, C1Val);
2049   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2050 
2051   // TODO: Should this propagate fast-math-flags?
2052   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2053   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2054 
2055   auto C2 = B.buildFConstant(Ty, C2Val);
2056   auto Fabs = B.buildFAbs(Ty, Src);
2057 
2058   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2059   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2060   MI.eraseFromParent();
2061   return true;
2062 }
2063 
2064 bool AMDGPULegalizerInfo::legalizeFceil(
2065   MachineInstr &MI, MachineRegisterInfo &MRI,
2066   MachineIRBuilder &B) const {
2067 
2068   const LLT S1 = LLT::scalar(1);
2069   const LLT S64 = LLT::scalar(64);
2070 
2071   Register Src = MI.getOperand(1).getReg();
2072   assert(MRI.getType(Src) == S64);
2073 
2074   // result = trunc(src)
2075   // if (src > 0.0 && src != result)
2076   //   result += 1.0
2077 
2078   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2079 
2080   const auto Zero = B.buildFConstant(S64, 0.0);
2081   const auto One = B.buildFConstant(S64, 1.0);
2082   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2083   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2084   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2085   auto Add = B.buildSelect(S64, And, One, Zero);
2086 
2087   // TODO: Should this propagate fast-math-flags?
2088   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2089   MI.eraseFromParent();
2090   return true;
2091 }
2092 
2093 bool AMDGPULegalizerInfo::legalizeFrem(
2094   MachineInstr &MI, MachineRegisterInfo &MRI,
2095   MachineIRBuilder &B) const {
2096     Register DstReg = MI.getOperand(0).getReg();
2097     Register Src0Reg = MI.getOperand(1).getReg();
2098     Register Src1Reg = MI.getOperand(2).getReg();
2099     auto Flags = MI.getFlags();
2100     LLT Ty = MRI.getType(DstReg);
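    // frem(x, y) == x - trunc(x / y) * y, built here as fma(-trunc(x/y), y, x).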
2101 
2102     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2103     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2104     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2105     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2106     MI.eraseFromParent();
2107     return true;
2108 }
2109 
2110 static MachineInstrBuilder extractF64Exponent(Register Hi,
2111                                               MachineIRBuilder &B) {
2112   const unsigned FractBits = 52;
2113   const unsigned ExpBits = 11;
2114   LLT S32 = LLT::scalar(32);
2115 
2116   auto Const0 = B.buildConstant(S32, FractBits - 32);
2117   auto Const1 = B.buildConstant(S32, ExpBits);
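  // Bitfield-extract the 11 exponent bits, which start at bit 20 of the high
  // word (bit 52 of the f64), then subtract the exponent bias.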
2118 
2119   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2120     .addUse(Hi)
2121     .addUse(Const0.getReg(0))
2122     .addUse(Const1.getReg(0));
2123 
2124   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2125 }
2126 
2127 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2128   MachineInstr &MI, MachineRegisterInfo &MRI,
2129   MachineIRBuilder &B) const {
2130   const LLT S1 = LLT::scalar(1);
2131   const LLT S32 = LLT::scalar(32);
2132   const LLT S64 = LLT::scalar(64);
2133 
2134   Register Src = MI.getOperand(1).getReg();
2135   assert(MRI.getType(Src) == S64);
2136 
2137   // TODO: Should this use extract since the low half is unused?
2138   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2139   Register Hi = Unmerge.getReg(1);
2140 
2141   // Extract the upper half, since this is where we will find the sign and
2142   // exponent.
2143   auto Exp = extractF64Exponent(Hi, B);
2144 
2145   const unsigned FractBits = 52;
2146 
2147   // Extract the sign bit.
2148   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2149   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2150 
2151   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2152 
2153   const auto Zero32 = B.buildConstant(S32, 0);
2154 
  // Extend back to 64 bits.
2156   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
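  // Shifting the fraction mask right by the exponent leaves ones in exactly
  // the fraction bits below the rounding point; clearing them in the source
  // truncates toward zero. exp < 0 produces a signed zero, and exp > 51 means
  // the value is already integral.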
2157 
2158   auto Shr = B.buildAShr(S64, FractMask, Exp);
2159   auto Not = B.buildNot(S64, Shr);
2160   auto Tmp0 = B.buildAnd(S64, Src, Not);
2161   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2162 
2163   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2164   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2165 
2166   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2167   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2168   MI.eraseFromParent();
2169   return true;
2170 }
2171 
2172 bool AMDGPULegalizerInfo::legalizeITOFP(
2173   MachineInstr &MI, MachineRegisterInfo &MRI,
2174   MachineIRBuilder &B, bool Signed) const {
2175 
2176   Register Dst = MI.getOperand(0).getReg();
2177   Register Src = MI.getOperand(1).getReg();
2178 
2179   const LLT S64 = LLT::scalar(64);
2180   const LLT S32 = LLT::scalar(32);
2181 
2182   assert(MRI.getType(Src) == S64);
2183 
2184   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2185   auto ThirtyTwo = B.buildConstant(S32, 32);
2186 
2187   if (MRI.getType(Dst) == S64) {
2188     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2189                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2190 
2191     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2192     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2193                      .addUse(CvtHi.getReg(0))
2194                      .addUse(ThirtyTwo.getReg(0));
2195 
2196     // TODO: Should this propagate fast-math-flags?
2197     B.buildFAdd(Dst, LdExp, CvtLo);
2198     MI.eraseFromParent();
2199     return true;
2200   }
2201 
2202   assert(MRI.getType(Dst) == S32);
2203 
2204   auto One = B.buildConstant(S32, 1);
2205 
2206   MachineInstrBuilder ShAmt;
2207   if (Signed) {
2208     auto ThirtyOne = B.buildConstant(S32, 31);
2209     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2210     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2211     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2212     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2213                                /*HasSideEffects=*/false)
2214                   .addUse(Unmerge.getReg(1));
2215     auto LS2 = B.buildSub(S32, LS, One);
2216     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2217   } else
2218     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
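  // Shift the source left so its leading bit reaches the high word, fold any
  // nonzero bits left in the low word into a sticky LSB for rounding, convert
  // the high word, and rescale the result with ldexp.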
2219   auto Norm = B.buildShl(S64, Src, ShAmt);
2220   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2221   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2222   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2223   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2224   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2225   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2226                    /*HasSideEffects=*/false)
2227       .addUse(FVal.getReg(0))
2228       .addUse(Scale.getReg(0));
2229   MI.eraseFromParent();
2230   return true;
2231 }
2232 
2233 // TODO: Copied from DAG implementation. Verify logic and document how this
2234 // actually works.
2235 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2236                                         MachineRegisterInfo &MRI,
2237                                         MachineIRBuilder &B,
2238                                         bool Signed) const {
2239 
2240   Register Dst = MI.getOperand(0).getReg();
2241   Register Src = MI.getOperand(1).getReg();
2242 
2243   const LLT S64 = LLT::scalar(64);
2244   const LLT S32 = LLT::scalar(32);
2245 
2246   const LLT SrcLT = MRI.getType(Src);
2247   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2248 
2249   unsigned Flags = MI.getFlags();
2250 
2251   // The basic idea of converting a floating point number into a pair of 32-bit
2252   // integers is illustrated as follows:
2253   //
2254   //     tf := trunc(val);
2255   //    hif := floor(tf * 2^-32);
2256   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2257   //     hi := fptoi(hif);
2258   //     lo := fptoi(lof);
2259   //
2260   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2261   MachineInstrBuilder Sign;
2262   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we take the absolute value
    // after truncating and flip the result back based on the original
    // signedness.
2268     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2269     Trunc = B.buildFAbs(S32, Trunc, Flags);
2270   }
2271   MachineInstrBuilder K0, K1;
2272   if (SrcLT == S64) {
2273     K0 = B.buildFConstant(S64,
2274                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2275     K1 = B.buildFConstant(S64,
2276                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2277   } else {
2278     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2279     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2280   }
2281 
2282   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2283   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2284   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2285 
2286   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2287                                      : B.buildFPTOUI(S32, FloorMul);
2288   auto Lo = B.buildFPTOUI(S32, Fma);
2289 
2290   if (Signed && SrcLT == S32) {
2291     // Flip the result based on the signedness, which is either all 0s or 1s.
2292     Sign = B.buildMerge(S64, {Sign, Sign});
2293     // r := xor({lo, hi}, sign) - sign;
2294     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2295   } else
2296     B.buildMerge(Dst, {Lo, Hi});
2297   MI.eraseFromParent();
2298 
2299   return true;
2300 }
2301 
2302 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2303                                                MachineInstr &MI) const {
2304   MachineFunction &MF = Helper.MIRBuilder.getMF();
2305   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2306 
2307   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2308                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2309 
2310   // With ieee_mode disabled, the instructions have the correct behavior
2311   // already for G_FMINNUM/G_FMAXNUM
2312   if (!MFI->getMode().IEEE)
2313     return !IsIEEEOp;
2314 
2315   if (IsIEEEOp)
2316     return true;
2317 
2318   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2319 }
2320 
2321 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2322   MachineInstr &MI, MachineRegisterInfo &MRI,
2323   MachineIRBuilder &B) const {
2324   // TODO: Should move some of this into LegalizerHelper.
2325 
2326   // TODO: Promote dynamic indexing of s16 to s32
2327 
2328   // FIXME: Artifact combiner probably should have replaced the truncated
2329   // constant before this, so we shouldn't need
2330   // getIConstantVRegValWithLookThrough.
2331   Optional<ValueAndVReg> MaybeIdxVal =
2332       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2333   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2334     return true;
2335   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2336 
2337   Register Dst = MI.getOperand(0).getReg();
2338   Register Vec = MI.getOperand(1).getReg();
2339 
2340   LLT VecTy = MRI.getType(Vec);
2341   LLT EltTy = VecTy.getElementType();
2342   assert(EltTy == MRI.getType(Dst));
2343 
2344   if (IdxVal < VecTy.getNumElements()) {
2345     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2346     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2347   } else {
2348     B.buildUndef(Dst);
2349   }
2350 
2351   MI.eraseFromParent();
2352   return true;
2353 }
2354 
2355 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2356   MachineInstr &MI, MachineRegisterInfo &MRI,
2357   MachineIRBuilder &B) const {
2358   // TODO: Should move some of this into LegalizerHelper.
2359 
2360   // TODO: Promote dynamic indexing of s16 to s32
2361 
2362   // FIXME: Artifact combiner probably should have replaced the truncated
2363   // constant before this, so we shouldn't need
2364   // getIConstantVRegValWithLookThrough.
2365   Optional<ValueAndVReg> MaybeIdxVal =
2366       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2367   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2368     return true;
2369 
2370   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2371   Register Dst = MI.getOperand(0).getReg();
2372   Register Vec = MI.getOperand(1).getReg();
2373   Register Ins = MI.getOperand(2).getReg();
2374 
2375   LLT VecTy = MRI.getType(Vec);
2376   LLT EltTy = VecTy.getElementType();
2377   assert(EltTy == MRI.getType(Ins));
2378   (void)Ins;
2379 
2380   unsigned NumElts = VecTy.getNumElements();
2381   if (IdxVal < NumElts) {
2382     SmallVector<Register, 8> SrcRegs;
2383     for (unsigned i = 0; i < NumElts; ++i)
2384       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2385     B.buildUnmerge(SrcRegs, Vec);
2386 
2387     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2388     B.buildMerge(Dst, SrcRegs);
2389   } else {
2390     B.buildUndef(Dst);
2391   }
2392 
2393   MI.eraseFromParent();
2394   return true;
2395 }
2396 
2397 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2398   MachineInstr &MI, MachineRegisterInfo &MRI,
2399   MachineIRBuilder &B) const {
2400   const LLT V2S16 = LLT::fixed_vector(2, 16);
2401 
2402   Register Dst = MI.getOperand(0).getReg();
2403   Register Src0 = MI.getOperand(1).getReg();
2404   LLT DstTy = MRI.getType(Dst);
2405   LLT SrcTy = MRI.getType(Src0);
2406 
2407   if (SrcTy == V2S16 && DstTy == V2S16 &&
2408       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2409     return true;
2410 
2411   MachineIRBuilder HelperBuilder(MI);
2412   GISelObserverWrapper DummyObserver;
2413   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2414   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2415 }
2416 
2417 bool AMDGPULegalizerInfo::legalizeSinCos(
2418   MachineInstr &MI, MachineRegisterInfo &MRI,
2419   MachineIRBuilder &B) const {
2420 
2421   Register DstReg = MI.getOperand(0).getReg();
2422   Register SrcReg = MI.getOperand(1).getReg();
2423   LLT Ty = MRI.getType(DstReg);
2424   unsigned Flags = MI.getFlags();
2425 
2426   Register TrigVal;
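  // The amdgcn sin/cos intrinsics take the input scaled by 1/(2*pi);
  // subtargets with a reduced trig range also need an explicit fract for range
  // reduction.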
2427   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2428   if (ST.hasTrigReducedRange()) {
2429     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2430     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2431       .addUse(MulVal.getReg(0))
2432       .setMIFlags(Flags).getReg(0);
2433   } else
2434     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2435 
2436   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2437     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2438   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2439     .addUse(TrigVal)
2440     .setMIFlags(Flags);
2441   MI.eraseFromParent();
2442   return true;
2443 }
2444 
2445 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2446                                                   MachineIRBuilder &B,
2447                                                   const GlobalValue *GV,
2448                                                   int64_t Offset,
2449                                                   unsigned GAFlags) const {
2450   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2451   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2452   // to the following code sequence:
2453   //
2454   // For constant address space:
2455   //   s_getpc_b64 s[0:1]
2456   //   s_add_u32 s0, s0, $symbol
2457   //   s_addc_u32 s1, s1, 0
2458   //
2459   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2460   //   a fixup or relocation is emitted to replace $symbol with a literal
2461   //   constant, which is a pc-relative offset from the encoding of the $symbol
2462   //   operand to the global variable.
2463   //
2464   // For global address space:
2465   //   s_getpc_b64 s[0:1]
2466   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2467   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2468   //
2469   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2470   //   fixups or relocations are emitted to replace $symbol@*@lo and
2471   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2472   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2473   //   operand to the global variable.
2474   //
2475   // What we want here is an offset from the value returned by s_getpc
2476   // (which is the address of the s_add_u32 instruction) to the global
2477   // variable, but since the encoding of $symbol starts 4 bytes after the start
2478   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2479   // small. This requires us to add 4 to the global variable offset in order to
2480   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2481   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2482   // instruction.
2483 
2484   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2485 
2486   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2487     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2488 
2489   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2490     .addDef(PCReg);
2491 
2492   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2493   if (GAFlags == SIInstrInfo::MO_NONE)
2494     MIB.addImm(0);
2495   else
2496     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2497 
2498   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2499 
2500   if (PtrTy.getSizeInBits() == 32)
2501     B.buildExtract(DstReg, PCReg, 0);
2502   return true;
2503  }
2504 
2505 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2506   MachineInstr &MI, MachineRegisterInfo &MRI,
2507   MachineIRBuilder &B) const {
2508   Register DstReg = MI.getOperand(0).getReg();
2509   LLT Ty = MRI.getType(DstReg);
2510   unsigned AS = Ty.getAddressSpace();
2511 
2512   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2513   MachineFunction &MF = B.getMF();
2514   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2515 
2516   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2517     if (!MFI->isModuleEntryFunction() &&
2518         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2519       const Function &Fn = MF.getFunction();
2520       DiagnosticInfoUnsupported BadLDSDecl(
2521         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2522         DS_Warning);
2523       Fn.getContext().diagnose(BadLDSDecl);
2524 
2525       // We currently don't have a way to correctly allocate LDS objects that
2526       // aren't directly associated with a kernel. We do force inlining of
2527       // functions that use local objects. However, if these dead functions are
2528       // not eliminated, we don't want a compile time error. Just emit a warning
2529       // and a trap, since there should be no callable path here.
2530       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2531       B.buildUndef(DstReg);
2532       MI.eraseFromParent();
2533       return true;
2534     }
2535 
2536     // TODO: We could emit code to handle the initialization somewhere.
2537     // We ignore the initializer for now and legalize it to allow selection.
2538     // The initializer will anyway get errored out during assembly emission.
2539     const SITargetLowering *TLI = ST.getTargetLowering();
2540     if (!TLI->shouldUseLDSConstAddress(GV)) {
2541       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2542       return true; // Leave in place;
2543     }
2544 
2545     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2546       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
      // zero-sized type in other languages, to declare dynamic shared memory
      // whose size is not known at compile time. It is allocated by the
      // runtime and placed directly after the statically allocated LDS, so all
      // such declarations share the same offset.
2552       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2553         // Adjust alignment for that dynamic shared memory array.
2554         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2555         LLT S32 = LLT::scalar(32);
2556         auto Sz =
2557             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2558         B.buildIntToPtr(DstReg, Sz);
2559         MI.eraseFromParent();
2560         return true;
2561       }
2562     }
2563 
2564     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2565                                                    *cast<GlobalVariable>(GV)));
2566     MI.eraseFromParent();
2567     return true;
2568   }
2569 
2570   const SITargetLowering *TLI = ST.getTargetLowering();
2571 
2572   if (TLI->shouldEmitFixup(GV)) {
2573     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2574     MI.eraseFromParent();
2575     return true;
2576   }
2577 
2578   if (TLI->shouldEmitPCReloc(GV)) {
2579     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2580     MI.eraseFromParent();
2581     return true;
2582   }
2583 
2584   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2585   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
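  // Neither fixup form applies, so go through the GOT: build the GOT entry's
  // address pc-relatively and load the global's address from it.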
2586 
2587   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2588   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2589       MachinePointerInfo::getGOT(MF),
2590       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2591           MachineMemOperand::MOInvariant,
2592       LoadTy, Align(8));
2593 
2594   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2595 
2596   if (Ty.getSizeInBits() == 32) {
2597     // Truncate if this is a 32-bit constant address.
2598     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2599     B.buildExtract(DstReg, Load, 0);
2600   } else
2601     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2602 
2603   MI.eraseFromParent();
2604   return true;
2605 }
2606 
2607 static LLT widenToNextPowerOf2(LLT Ty) {
2608   if (Ty.isVector())
2609     return Ty.changeElementCount(
2610         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2611   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2612 }
2613 
2614 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2615                                        MachineInstr &MI) const {
2616   MachineIRBuilder &B = Helper.MIRBuilder;
2617   MachineRegisterInfo &MRI = *B.getMRI();
2618   GISelChangeObserver &Observer = Helper.Observer;
2619 
2620   Register PtrReg = MI.getOperand(1).getReg();
2621   LLT PtrTy = MRI.getType(PtrReg);
2622   unsigned AddrSpace = PtrTy.getAddressSpace();
2623 
2624   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2625     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2626     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2627     Observer.changingInstr(MI);
2628     MI.getOperand(1).setReg(Cast.getReg(0));
2629     Observer.changedInstr(MI);
2630     return true;
2631   }
2632 
2633   if (MI.getOpcode() != AMDGPU::G_LOAD)
2634     return false;
2635 
2636   Register ValReg = MI.getOperand(0).getReg();
2637   LLT ValTy = MRI.getType(ValReg);
2638 
2639   MachineMemOperand *MMO = *MI.memoperands_begin();
2640   const unsigned ValSize = ValTy.getSizeInBits();
2641   const LLT MemTy = MMO->getMemoryType();
2642   const Align MemAlign = MMO->getAlign();
2643   const unsigned MemSize = MemTy.getSizeInBits();
2644   const uint64_t AlignInBits = 8 * MemAlign.value();
2645 
2646   // Widen non-power-of-2 loads to the alignment if needed
2647   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2648     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2649 
2650     // This was already the correct extending load result type, so just adjust
2651     // the memory type.
2652     if (WideMemSize == ValSize) {
2653       MachineFunction &MF = B.getMF();
2654 
2655       MachineMemOperand *WideMMO =
2656           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2657       Observer.changingInstr(MI);
2658       MI.setMemRefs(MF, {WideMMO});
2659       Observer.changedInstr(MI);
2660       return true;
2661     }
2662 
    // Don't bother handling this edge case, which should probably never be
    // produced.
2664     if (ValSize > WideMemSize)
2665       return false;
2666 
2667     LLT WideTy = widenToNextPowerOf2(ValTy);
2668 
2669     Register WideLoad;
2670     if (!WideTy.isVector()) {
2671       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2672       B.buildTrunc(ValReg, WideLoad).getReg(0);
2673     } else {
2674       // Extract the subvector.
2675 
2676       if (isRegisterType(ValTy)) {
2677         // If this a case where G_EXTRACT is legal, use it.
2678         // (e.g. <3 x s32> -> <4 x s32>)
2679         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2680         B.buildExtract(ValReg, WideLoad, 0);
2681       } else {
2682         // For cases where the widened type isn't a nice register value, unmerge
2683         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2684         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2685         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2686       }
2687     }
2688 
2689     MI.eraseFromParent();
2690     return true;
2691   }
2692 
2693   return false;
2694 }
2695 
2696 bool AMDGPULegalizerInfo::legalizeFMad(
2697   MachineInstr &MI, MachineRegisterInfo &MRI,
2698   MachineIRBuilder &B) const {
2699   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2700   assert(Ty.isScalar());
2701 
2702   MachineFunction &MF = B.getMF();
2703   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2704 
2705   // TODO: Always legal with future ftz flag.
2706   // FIXME: Do we need just output?
2707   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2708     return true;
2709   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2710     return true;
2711 
2712   MachineIRBuilder HelperBuilder(MI);
2713   GISelObserverWrapper DummyObserver;
2714   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2715   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2716 }
2717 
2718 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2719   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2720   Register DstReg = MI.getOperand(0).getReg();
2721   Register PtrReg = MI.getOperand(1).getReg();
2722   Register CmpVal = MI.getOperand(2).getReg();
2723   Register NewVal = MI.getOperand(3).getReg();
2724 
2725   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2726          "this should not have been custom lowered");
2727 
2728   LLT ValTy = MRI.getType(CmpVal);
2729   LLT VecTy = LLT::fixed_vector(2, ValTy);
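  // G_AMDGPU_ATOMIC_CMPXCHG expects the new value and the compare value packed
  // into a single vector operand.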
2730 
2731   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2732 
2733   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2734     .addDef(DstReg)
2735     .addUse(PtrReg)
2736     .addUse(PackedVal)
2737     .setMemRefs(MI.memoperands());
2738 
2739   MI.eraseFromParent();
2740   return true;
2741 }
2742 
2743 bool AMDGPULegalizerInfo::legalizeFlog(
2744   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2745   Register Dst = MI.getOperand(0).getReg();
2746   Register Src = MI.getOperand(1).getReg();
2747   LLT Ty = B.getMRI()->getType(Dst);
2748   unsigned Flags = MI.getFlags();
2749 
2750   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2751   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2752 
2753   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2754   MI.eraseFromParent();
2755   return true;
2756 }
2757 
2758 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2759                                        MachineIRBuilder &B) const {
2760   Register Dst = MI.getOperand(0).getReg();
2761   Register Src = MI.getOperand(1).getReg();
2762   unsigned Flags = MI.getFlags();
2763   LLT Ty = B.getMRI()->getType(Dst);
2764 
2765   auto K = B.buildFConstant(Ty, numbers::log2e);
2766   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2767   B.buildFExp2(Dst, Mul, Flags);
2768   MI.eraseFromParent();
2769   return true;
2770 }
2771 
2772 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2773                                        MachineIRBuilder &B) const {
2774   Register Dst = MI.getOperand(0).getReg();
2775   Register Src0 = MI.getOperand(1).getReg();
2776   Register Src1 = MI.getOperand(2).getReg();
2777   unsigned Flags = MI.getFlags();
2778   LLT Ty = B.getMRI()->getType(Dst);
2779   const LLT S16 = LLT::scalar(16);
2780   const LLT S32 = LLT::scalar(32);
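  // Expand pow(x, y) as exp2(y * log2(x)), using the fmul_legacy intrinsic for
  // the multiply.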
2781 
2782   if (Ty == S32) {
2783     auto Log = B.buildFLog2(S32, Src0, Flags);
2784     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2785       .addUse(Log.getReg(0))
2786       .addUse(Src1)
2787       .setMIFlags(Flags);
2788     B.buildFExp2(Dst, Mul, Flags);
2789   } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so extend to f32 for the multiply and
    // truncate the result back to f16.
2791     auto Log = B.buildFLog2(S16, Src0, Flags);
2792     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2793     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2794     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2795       .addUse(Ext0.getReg(0))
2796       .addUse(Ext1.getReg(0))
2797       .setMIFlags(Flags);
2798 
2799     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2800   } else
2801     return false;
2802 
2803   MI.eraseFromParent();
2804   return true;
2805 }
2806 
2807 // Find a source register, ignoring any possible source modifiers.
2808 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2809   Register ModSrc = OrigSrc;
2810   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2811     ModSrc = SrcFNeg->getOperand(1).getReg();
2812     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2813       ModSrc = SrcFAbs->getOperand(1).getReg();
2814   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2815     ModSrc = SrcFAbs->getOperand(1).getReg();
2816   return ModSrc;
2817 }
2818 
2819 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2820                                          MachineRegisterInfo &MRI,
2821                                          MachineIRBuilder &B) const {
2822 
2823   const LLT S1 = LLT::scalar(1);
2824   const LLT S64 = LLT::scalar(64);
2825   Register Dst = MI.getOperand(0).getReg();
2826   Register OrigSrc = MI.getOperand(1).getReg();
2827   unsigned Flags = MI.getFlags();
2828   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2829          "this should not have been custom lowered");
2830 
2831   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2832   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2833   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2834   // V_FRACT bug is:
2835   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2836   //
2837   // Convert floor(x) to (x - fract(x))
2838 
2839   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2840     .addUse(OrigSrc)
2841     .setMIFlags(Flags);
2842 
2843   // Give source modifier matching some assistance before obscuring a foldable
2844   // pattern.
2845 
2846   // TODO: We can avoid the neg on the fract? The input sign to fract
2847   // shouldn't matter?
2848   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2849 
2850   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2851 
2852   Register Min = MRI.createGenericVirtualRegister(S64);
2853 
2854   // We don't need to concern ourselves with the snan handling difference, so
2855   // use the one which will directly select.
2856   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2857   if (MFI->getMode().IEEE)
2858     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2859   else
2860     B.buildFMinNum(Min, Fract, Const, Flags);
2861 
2862   Register CorrectedFract = Min;
2863   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2864     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2865     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2866   }
2867 
2868   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2869   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2870 
2871   MI.eraseFromParent();
2872   return true;
2873 }
2874 
2875 // Turn an illegal packed v2s16 build vector into bit operations.
2876 // TODO: This should probably be a bitcast action in LegalizerHelper.
2877 bool AMDGPULegalizerInfo::legalizeBuildVector(
2878   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2879   Register Dst = MI.getOperand(0).getReg();
2880   const LLT S32 = LLT::scalar(32);
2881   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2882 
2883   Register Src0 = MI.getOperand(1).getReg();
2884   Register Src1 = MI.getOperand(2).getReg();
2885   assert(MRI.getType(Src0) == LLT::scalar(16));
2886 
2887   auto Merge = B.buildMerge(S32, {Src0, Src1});
2888   B.buildBitcast(Dst, Merge);
2889 
2890   MI.eraseFromParent();
2891   return true;
2892 }
2893 
2894 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2895 //
2896 // Source and accumulation registers must all be 32-bits.
2897 //
2898 // TODO: When the multiply is uniform, we should produce a code sequence
2899 // that is better suited to instruction selection on the SALU. Instead of
2900 // the outer loop going over parts of the result, the outer loop should go
2901 // over parts of one of the factors. This should result in instruction
2902 // selection that makes full use of S_ADDC_U32 instructions.
2903 void AMDGPULegalizerInfo::buildMultiply(
2904     LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2905     ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2906     bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2907   // Use (possibly empty) vectors of S1 registers to represent the set of
2908   // carries from one pair of positions to the next.
2909   using Carry = SmallVector<Register, 2>;
2910 
2911   MachineIRBuilder &B = Helper.MIRBuilder;
2912 
2913   const LLT S1 = LLT::scalar(1);
2914   const LLT S32 = LLT::scalar(32);
2915   const LLT S64 = LLT::scalar(64);
2916 
2917   Register Zero32;
2918   Register Zero64;
2919 
2920   auto getZero32 = [&]() -> Register {
2921     if (!Zero32)
2922       Zero32 = B.buildConstant(S32, 0).getReg(0);
2923     return Zero32;
2924   };
2925   auto getZero64 = [&]() -> Register {
2926     if (!Zero64)
2927       Zero64 = B.buildConstant(S64, 0).getReg(0);
2928     return Zero64;
2929   };
2930 
2931   // Merge the given carries into the 32-bit LocalAccum, which is modified
2932   // in-place.
2933   //
2934   // Returns the carry-out, which is a single S1 register or null.
2935   auto mergeCarry =
2936       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2937         if (CarryIn.empty())
2938           return Register();
2939 
2940         bool HaveCarryOut = true;
2941         Register CarryAccum;
2942         if (CarryIn.size() == 1) {
2943           if (!LocalAccum) {
2944             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2945             return Register();
2946           }
2947 
2948           CarryAccum = getZero32();
2949         } else {
2950           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2951           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2952             CarryAccum =
2953                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2954                     .getReg(0);
2955           }
2956 
2957           if (!LocalAccum) {
2958             LocalAccum = getZero32();
2959             HaveCarryOut = false;
2960           }
2961         }
2962 
2963         auto Add =
2964             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2965         LocalAccum = Add.getReg(0);
2966         return HaveCarryOut ? Add.getReg(1) : Register();
2967       };
2968 
2969   // Build a multiply-add chain to compute
2970   //
2971   //   LocalAccum + (partial products at DstIndex)
2972   //       + (opportunistic subset of CarryIn)
2973   //
2974   // LocalAccum is an array of one or two 32-bit registers that are updated
2975   // in-place. The incoming registers may be null.
2976   //
2977   // In some edge cases, carry-ins can be consumed "for free". In that case,
2978   // the consumed carry bits are removed from CarryIn in-place.
2979   auto buildMadChain =
2980       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2981           -> Carry {
2982         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2983                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2984 
2986         Carry CarryOut;
2987         unsigned j0 = 0;
2988 
2989         // Use plain 32-bit multiplication for the most significant part of the
2990         // result by default.
2991         if (LocalAccum.size() == 1 &&
2992             (!UsePartialMad64_32 || !CarryIn.empty())) {
2993           do {
2994             unsigned j1 = DstIndex - j0;
2995             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
2996             if (!LocalAccum[0]) {
2997               LocalAccum[0] = Mul.getReg(0);
2998             } else {
2999               if (CarryIn.empty()) {
3000                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3001               } else {
3002                 LocalAccum[0] =
3003                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3004                         .getReg(0);
3005                 CarryIn.pop_back();
3006               }
3007             }
3008             ++j0;
3009           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3010         }
3011 
3012         // Build full 64-bit multiplies.
3013         if (j0 <= DstIndex) {
3014           bool HaveSmallAccum = false;
3015           Register Tmp;
3016 
3017           if (LocalAccum[0]) {
3018             if (LocalAccum.size() == 1) {
3019               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3020               HaveSmallAccum = true;
3021             } else if (LocalAccum[1]) {
3022               Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
3023               HaveSmallAccum = false;
3024             } else {
3025               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3026               HaveSmallAccum = true;
3027             }
3028           } else {
3029             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3030             Tmp = getZero64();
3031             HaveSmallAccum = true;
3032           }
3033 
3034           do {
3035             unsigned j1 = DstIndex - j0;
3036             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3037                                     {Src0[j0], Src1[j1], Tmp});
3038             Tmp = Mad.getReg(0);
3039             if (!HaveSmallAccum)
3040               CarryOut.push_back(Mad.getReg(1));
3041             HaveSmallAccum = false;
3042             ++j0;
3043           } while (j0 <= DstIndex);
3044 
3045           auto Unmerge = B.buildUnmerge(S32, Tmp);
3046           LocalAccum[0] = Unmerge.getReg(0);
3047           if (LocalAccum.size() > 1)
3048             LocalAccum[1] = Unmerge.getReg(1);
3049         }
3050 
3051         return CarryOut;
3052       };
3053 
3054   // Outer multiply loop, iterating over destination parts from least
3055   // significant to most significant parts.
3056   //
3057   // The columns of the following diagram correspond to the destination parts
3058   // affected by one iteration of the outer loop (ignoring boundary
3059   // conditions).
3060   //
3061   //   Dest index relative to 2 * i:      1 0 -1
3062   //                                      ------
3063   //   Carries from previous iteration:     e o
3064   //   Even-aligned partial product sum:  E E .
3065   //   Odd-aligned partial product sum:     O O
3066   //
3067   // 'o' is OddCarry, 'e' is EvenCarry.
3068   // EE and OO are computed from partial products via buildMadChain and use
3069   // accumulation where possible and appropriate.
3070   //
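  // A rough illustration: for a 128-bit multiply, Accum has 4 parts. At
  // iteration i == 1, the even-aligned chain accumulates the column-2 products
  // Src0[0]*Src1[2] + Src0[1]*Src1[1] + Src0[2]*Src1[0] into Accum[2..3], and
  // the odd-aligned chain accumulates the column-1 products
  // Src0[0]*Src1[1] + Src0[1]*Src1[0] into Accum[1..2].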
3071   Register SeparateOddCarry;
3072   Carry EvenCarry;
3073   Carry OddCarry;
3074 
3075   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3076     Carry OddCarryIn = std::move(OddCarry);
3077     Carry EvenCarryIn = std::move(EvenCarry);
3078     OddCarry.clear();
3079     EvenCarry.clear();
3080 
3081     // Partial products at offset 2 * i.
3082     if (2 * i < Accum.size()) {
3083       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3084       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3085     }
3086 
3087     // Partial products at offset 2 * i - 1.
3088     if (i > 0) {
3089       if (!SeparateOddAlignedProducts) {
3090         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3091         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3092       } else {
3093         bool IsHighest = 2 * i >= Accum.size();
3094         Register SeparateOddOut[2];
3095         auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
3096                               .take_front(IsHighest ? 1 : 2);
3097         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3098 
3099         MachineInstr *Lo;
3100 
3101         if (i == 1) {
3102           if (!IsHighest)
3103             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3104           else
3105             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3106         } else {
3107           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3108                             SeparateOddCarry);
3109         }
3110         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3111 
3112         if (!IsHighest) {
3113           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3114                                 Lo->getOperand(1).getReg());
3115           Accum[2 * i] = Hi.getReg(0);
3116           SeparateOddCarry = Hi.getReg(1);
3117         }
3118       }
3119     }
3120 
3121     // Add in the carries from the previous iteration
3122     if (i > 0) {
3123       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3124         EvenCarryIn.push_back(CarryOut);
3125 
3126       if (2 * i < Accum.size()) {
3127         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3128           OddCarry.push_back(CarryOut);
3129       }
3130     }
3131   }
3132 }
3133 
3134 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3135 //
3136 // TODO: If the multiply is followed by an addition, we should attempt to
3137 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3138 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3139                                       MachineInstr &MI) const {
3140   assert(ST.hasMad64_32());
3141   assert(MI.getOpcode() == TargetOpcode::G_MUL);
3142 
3143   MachineIRBuilder &B = Helper.MIRBuilder;
3144   MachineRegisterInfo &MRI = *B.getMRI();
3145 
3146   Register DstReg = MI.getOperand(0).getReg();
3147   Register Src0 = MI.getOperand(1).getReg();
3148   Register Src1 = MI.getOperand(2).getReg();
3149 
3150   LLT Ty = MRI.getType(DstReg);
3151   assert(Ty.isScalar());
3152 
3153   unsigned Size = Ty.getSizeInBits();
3154   unsigned NumParts = Size / 32;
3155   assert((Size % 32) == 0);
3156   assert(NumParts >= 2);
3157 
3158   // Whether to use MAD_64_32 for partial products whose high half is
3159   // discarded. This avoids some ADD instructions but risks false dependency
3160   // stalls on some subtargets in some cases.
3161   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3162 
3163   // Whether to compute odd-aligned partial products separately. This is
3164   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3165   // in an even-aligned VGPR.
3166   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3167 
3168   LLT S32 = LLT::scalar(32);
3169   SmallVector<Register, 2> Src0Parts, Src1Parts;
3170   for (unsigned i = 0; i < NumParts; ++i) {
3171     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3172     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3173   }
3174   B.buildUnmerge(Src0Parts, Src0);
3175   B.buildUnmerge(Src1Parts, Src1);
3176 
3177   SmallVector<Register, 2> AccumRegs(NumParts);
3178   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3179                 SeparateOddAlignedProducts);
3180 
3181   B.buildMerge(DstReg, AccumRegs);
3182   MI.eraseFromParent();
3183   return true;
3185 }
3186 
3187 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3188 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3189 // case with a single min instruction instead of a compare+select.
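// For example, ctlz(0): the AMDGPU find-first-bit instructions return -1 (all
// ones) for a zero input, so umin(~0u, 32) yields the expected result of 32.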
3190 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3191                                             MachineRegisterInfo &MRI,
3192                                             MachineIRBuilder &B) const {
3193   Register Dst = MI.getOperand(0).getReg();
3194   Register Src = MI.getOperand(1).getReg();
3195   LLT DstTy = MRI.getType(Dst);
3196   LLT SrcTy = MRI.getType(Src);
3197 
3198   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3199                         ? AMDGPU::G_AMDGPU_FFBH_U32
3200                         : AMDGPU::G_AMDGPU_FFBL_B32;
3201   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3202   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3203 
3204   MI.eraseFromParent();
3205   return true;
3206 }
3207 
3208 // Check that this is a G_XOR x, -1
3209 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3210   if (MI.getOpcode() != TargetOpcode::G_XOR)
3211     return false;
3212   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3213   return ConstVal && *ConstVal == -1;
3214 }
3215 
3216 // Return the branch instruction that uses the condition, or null if invalid.
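// The shape being matched is roughly (a sketch, not an exhaustive spec):
//   %cond = G_INTRINSIC ...
//   [%ncond = G_XOR %cond, -1]   ; optional negation, single use
//   G_BRCOND %cond|%ncond, %then
//   G_BR %else                   ; or fallthrough into the next block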
3217 static MachineInstr *
3218 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3219                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3220   Register CondDef = MI.getOperand(0).getReg();
3221   if (!MRI.hasOneNonDBGUse(CondDef))
3222     return nullptr;
3223 
3224   MachineBasicBlock *Parent = MI.getParent();
3225   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3226 
3227   if (isNot(MRI, *UseMI)) {
3228     Register NegatedCond = UseMI->getOperand(0).getReg();
3229     if (!MRI.hasOneNonDBGUse(NegatedCond))
3230       return nullptr;
3231 
3232     // This G_XOR will be dead once the branch is rewritten, so erase it now.
3233     eraseInstr(*UseMI, MRI);
3234 
3235     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3236     Negated = true;
3237   }
3238 
3239   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3240     return nullptr;
3241 
3242   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3243   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3244   if (Next == Parent->end()) {
3245     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3246     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3247       return nullptr;
3248     UncondBrTarget = &*NextMBB;
3249   } else {
3250     if (Next->getOpcode() != AMDGPU::G_BR)
3251       return nullptr;
3252     Br = &*Next;
3253     UncondBrTarget = Br->getOperand(0).getMBB();
3254   }
3255 
3256   return UseMI;
3257 }
3258 
3259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3260                                          const ArgDescriptor *Arg,
3261                                          const TargetRegisterClass *ArgRC,
3262                                          LLT ArgTy) const {
3263   MCRegister SrcReg = Arg->getRegister();
3264   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3265   assert(DstReg.isVirtual() && "Virtual register expected");
3266 
3267   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3268                                              *ArgRC, B.getDebugLoc(), ArgTy);
3269   if (Arg->isMasked()) {
3270     // TODO: Should we try to emit this once in the entry block?
3271     const LLT S32 = LLT::scalar(32);
3272     const unsigned Mask = Arg->getMask();
3273     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
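    // For example, if this value were packed in bits [19:10] of the input
    // register (a layout used here only for illustration), Mask would be
    // 0xffc00 and Shift would be 10, so we emit an lshr by 10 followed by an
    // AND with 0x3ff.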
3274 
3275     Register AndMaskSrc = LiveIn;
3276 
3277     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3278     // 0.
3279     if (Shift != 0) {
3280       auto ShiftAmt = B.buildConstant(S32, Shift);
3281       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3282     }
3283 
3284     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3285   } else {
3286     B.buildCopy(DstReg, LiveIn);
3287   }
3288 
3289   return true;
3290 }
3291 
3292 bool AMDGPULegalizerInfo::loadInputValue(
3293     Register DstReg, MachineIRBuilder &B,
3294     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3295   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3296   const ArgDescriptor *Arg;
3297   const TargetRegisterClass *ArgRC;
3298   LLT ArgTy;
3299   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3300 
3301   if (!Arg) {
3302     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3303       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3304       // case the pointer argument may be missing and we use null.
3305       B.buildConstant(DstReg, 0);
3306       return true;
3307     }
3308 
3309     // It's undefined behavior if a function marked with the amdgpu-no-*
3310     // attributes uses the corresponding intrinsic.
3311     B.buildUndef(DstReg);
3312     return true;
3313   }
3314 
3315   if (!Arg->isRegister() || !Arg->getRegister().isValid())
3316     return false; // TODO: Handle these
3317   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3318 }
3319 
3320 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3321     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3322     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3323   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3324     return false;
3325 
3326   MI.eraseFromParent();
3327   return true;
3328 }
3329 
3330 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3331                                 int64_t C) {
3332   B.buildConstant(MI.getOperand(0).getReg(), C);
3333   MI.eraseFromParent();
3334   return true;
3335 }
3336 
3337 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3338     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3339     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3340   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3341   if (MaxID == 0)
3342     return replaceWithConstant(B, MI, 0);
3343 
3344   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3345   const ArgDescriptor *Arg;
3346   const TargetRegisterClass *ArgRC;
3347   LLT ArgTy;
3348   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3349 
3350   Register DstReg = MI.getOperand(0).getReg();
3351   if (!Arg) {
3352     // It's undefined behavior if a function marked with the amdgpu-no-*
3353     // attributes uses the corresponding intrinsic.
3354     B.buildUndef(DstReg);
3355     MI.eraseFromParent();
3356     return true;
3357   }
3358 
3359   if (Arg->isMasked()) {
3360     // Don't bother inserting AssertZext for packed IDs since we're emitting the
3361     // masking operations anyway.
3362     //
3363     // TODO: We could assert the top bit is 0 for the source copy.
3364     if (!loadInputValue(DstReg, B, ArgType))
3365       return false;
3366   } else {
3367     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3368     if (!loadInputValue(TmpReg, B, ArgType))
3369       return false;
3370     B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
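    // E.g. for MaxID == 1023 this asserts that only the low 10 bits may be set.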
3371   }
3372 
3373   MI.eraseFromParent();
3374   return true;
3375 }
3376 
3377 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3378                                                      int64_t Offset) const {
3379   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3380   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3381 
3382   // TODO: If we passed in the base kernel offset we could have a better
3383   // alignment than 4, but we don't really need it.
3384   if (!loadInputValue(KernArgReg, B,
3385                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3386     llvm_unreachable("failed to find kernarg segment ptr");
3387 
3388   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3389   // TODO: Should get nuw
3390   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3391 }
3392 
3393 /// Legalize a value that's loaded from kernel arguments. This is only used by
3394 /// legacy intrinsics.
3395 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3396                                                       MachineIRBuilder &B,
3397                                                       uint64_t Offset,
3398                                                       Align Alignment) const {
3399   Register DstReg = MI.getOperand(0).getReg();
3400 
3401   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3402          "unexpected kernarg parameter type");
3403 
3404   Register Ptr = getKernargParameterPtr(B, Offset);
3405   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3406   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3407               MachineMemOperand::MODereferenceable |
3408                   MachineMemOperand::MOInvariant);
3409   MI.eraseFromParent();
3410   return true;
3411 }
3412 
3413 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3414                                        MachineRegisterInfo &MRI,
3415                                        MachineIRBuilder &B) const {
3416   Register Dst = MI.getOperand(0).getReg();
3417   LLT DstTy = MRI.getType(Dst);
3418   LLT S16 = LLT::scalar(16);
3419   LLT S32 = LLT::scalar(32);
3420   LLT S64 = LLT::scalar(64);
3421 
3422   if (DstTy == S16)
3423     return legalizeFDIV16(MI, MRI, B);
3424   if (DstTy == S32)
3425     return legalizeFDIV32(MI, MRI, B);
3426   if (DstTy == S64)
3427     return legalizeFDIV64(MI, MRI, B);
3428 
3429   return false;
3430 }
3431 
3432 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3433                                                         Register DstDivReg,
3434                                                         Register DstRemReg,
3435                                                         Register X,
3436                                                         Register Y) const {
3437   const LLT S1 = LLT::scalar(1);
3438   const LLT S32 = LLT::scalar(32);
3439 
3440   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3441   // algorithm used here.
3442 
3443   // Initial estimate of inv(y).
3444   auto FloatY = B.buildUITOFP(S32, Y);
3445   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3446   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
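  // 0x4f7ffffe is a float just below 2^32 (about 2^32 * (1 - 2^-23)); scaling
  // by it presumably biases the integer reciprocal estimate to err low.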
3447   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3448   auto Z = B.buildFPTOUI(S32, ScaledY);
3449 
3450   // One round of UNR (unsigned Newton-Raphson) to refine the estimate.
3451   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3452   auto NegYZ = B.buildMul(S32, NegY, Z);
3453   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3454 
3455   // Quotient/remainder estimate.
3456   auto Q = B.buildUMulH(S32, X, Z);
3457   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3458 
3459   // First quotient/remainder refinement.
3460   auto One = B.buildConstant(S32, 1);
3461   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3462   if (DstDivReg)
3463     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3464   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3465 
3466   // Second quotient/remainder refinement.
3467   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3468   if (DstDivReg)
3469     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3470 
3471   if (DstRemReg)
3472     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3473 }
3474 
3475 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3476 //
3477 // Return lo, hi of result
3478 //
3479 // %cvt.lo = G_UITOFP Val.lo
3480 // %cvt.hi = G_UITOFP Val.hi
3481 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3482 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3483 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3484 // %mul2 = G_FMUL %mul1, 2**(-32)
3485 // %trunc = G_INTRINSIC_TRUNC %mul2
3486 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3487 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
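// The float bit patterns above are powers of two: 0x4f800000 is 2^32,
// 0x2f800000 is 2^-32, and 0xcf800000 is -(2^32). 0x5f7ffffc is slightly below
// 2^64, presumably chosen so the scaled estimate cannot overflow the
// conversion back to integer.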
3488 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3489                                                        Register Val) {
3490   const LLT S32 = LLT::scalar(32);
3491   auto Unmerge = B.buildUnmerge(S32, Val);
3492 
3493   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3494   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3495 
3496   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3497                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3498 
3499   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3500   auto Mul1 =
3501       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3502 
3503   // 2**(-32)
3504   auto Mul2 =
3505       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3506   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3507 
3508   // -(2**32)
3509   auto Mad2 = B.buildFMAD(S32, Trunc,
3510                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3511 
3512   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3513   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3514 
3515   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3516 }
3517 
3518 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3519                                                         Register DstDivReg,
3520                                                         Register DstRemReg,
3521                                                         Register Numer,
3522                                                         Register Denom) const {
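  // Roughly: build a ~2^64/Denom fixed-point reciprocal from the float rcp
  // estimate, refine it with two Newton-Raphson style steps in 64-bit integer
  // arithmetic, form the quotient estimate as the high half of Numer * Rcp,
  // and finally correct quotient/remainder with up to two conditional
  // subtractions of Denom.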
3523   const LLT S32 = LLT::scalar(32);
3524   const LLT S64 = LLT::scalar(64);
3525   const LLT S1 = LLT::scalar(1);
3526   Register RcpLo, RcpHi;
3527 
3528   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3529 
3530   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3531 
3532   auto Zero64 = B.buildConstant(S64, 0);
3533   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3534 
3535   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3536   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3537 
3538   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3539   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3540   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3541 
3542   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3543   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3544   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3545 
3546   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3547   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3548   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3549   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3550   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3551 
3552   auto Zero32 = B.buildConstant(S32, 0);
3553   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3554   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3555   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3556 
3557   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3558   Register NumerLo = UnmergeNumer.getReg(0);
3559   Register NumerHi = UnmergeNumer.getReg(1);
3560 
3561   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3562   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3563   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3564   Register Mul3_Lo = UnmergeMul3.getReg(0);
3565   Register Mul3_Hi = UnmergeMul3.getReg(1);
3566   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3567   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3568   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3569   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3570 
3571   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3572   Register DenomLo = UnmergeDenom.getReg(0);
3573   Register DenomHi = UnmergeDenom.getReg(1);
3574 
3575   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3576   auto C1 = B.buildSExt(S32, CmpHi);
3577 
3578   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3579   auto C2 = B.buildSExt(S32, CmpLo);
3580 
3581   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3582   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3583 
3584   // TODO: Here and below, portions of the code could be enclosed in if/endif
3585   // blocks. Currently the control flow is unconditional, and we have 4 selects
3586   // after the potential endif to substitute for PHIs.
3587 
3588   // if C3 != 0 ...
3589   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3590   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3591   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3592   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3593 
3594   auto One64 = B.buildConstant(S64, 1);
3595   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3596 
3597   auto C4 =
3598       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3599   auto C5 =
3600       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3601   auto C6 = B.buildSelect(
3602       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3603 
3604   // if (C6 != 0)
3605   auto Add4 = B.buildAdd(S64, Add3, One64);
3606   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3607 
3608   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3609   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3610   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3611 
3612   // endif C6
3613   // endif C3
3614 
3615   if (DstDivReg) {
3616     auto Sel1 = B.buildSelect(
3617         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3618     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3619                   Sel1, MulHi3);
3620   }
3621 
3622   if (DstRemReg) {
3623     auto Sel2 = B.buildSelect(
3624         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3625     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3626                   Sel2, Sub1);
3627   }
3628 }
3629 
3630 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3631                                                   MachineRegisterInfo &MRI,
3632                                                   MachineIRBuilder &B) const {
3633   Register DstDivReg, DstRemReg;
3634   switch (MI.getOpcode()) {
3635   default:
3636     llvm_unreachable("Unexpected opcode!");
3637   case AMDGPU::G_UDIV: {
3638     DstDivReg = MI.getOperand(0).getReg();
3639     break;
3640   }
3641   case AMDGPU::G_UREM: {
3642     DstRemReg = MI.getOperand(0).getReg();
3643     break;
3644   }
3645   case AMDGPU::G_UDIVREM: {
3646     DstDivReg = MI.getOperand(0).getReg();
3647     DstRemReg = MI.getOperand(1).getReg();
3648     break;
3649   }
3650   }
3651 
3652   const LLT S64 = LLT::scalar(64);
3653   const LLT S32 = LLT::scalar(32);
3654   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3655   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3656   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3657   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3658 
3659   if (Ty == S32)
3660     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3661   else if (Ty == S64)
3662     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3663   else
3664     return false;
3665 
3666   MI.eraseFromParent();
3667   return true;
3668 }
3669 
3670 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3671                                                 MachineRegisterInfo &MRI,
3672                                                 MachineIRBuilder &B) const {
3673   const LLT S64 = LLT::scalar(64);
3674   const LLT S32 = LLT::scalar(32);
3675 
3676   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3677   if (Ty != S32 && Ty != S64)
3678     return false;
3679 
3680   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3681   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3682   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3683 
3684   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3685   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3686   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3687 
3688   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3689   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3690 
3691   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3692   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
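  // At this point LHS and RHS hold the absolute values of the inputs:
  // (x + (x >> (bits - 1))) ^ (x >> (bits - 1)) is the usual branchless abs.
  // The result signs are restored below with the complementary (x ^ s) - s
  // conditional negation.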
3693 
3694   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3695   switch (MI.getOpcode()) {
3696   default:
3697     llvm_unreachable("Unexpected opcode!");
3698   case AMDGPU::G_SDIV: {
3699     DstDivReg = MI.getOperand(0).getReg();
3700     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3701     break;
3702   }
3703   case AMDGPU::G_SREM: {
3704     DstRemReg = MI.getOperand(0).getReg();
3705     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3706     break;
3707   }
3708   case AMDGPU::G_SDIVREM: {
3709     DstDivReg = MI.getOperand(0).getReg();
3710     DstRemReg = MI.getOperand(1).getReg();
3711     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3712     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3713     break;
3714   }
3715   }
3716 
3717   if (Ty == S32)
3718     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3719   else
3720     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3721 
3722   if (DstDivReg) {
3723     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3724     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3725     B.buildSub(DstDivReg, SignXor, Sign);
3726   }
3727 
3728   if (DstRemReg) {
3729     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3730     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3731     B.buildSub(DstRemReg, SignXor, Sign);
3732   }
3733 
3734   MI.eraseFromParent();
3735   return true;
3736 }
3737 
3738 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3739                                                  MachineRegisterInfo &MRI,
3740                                                  MachineIRBuilder &B) const {
3741   Register Res = MI.getOperand(0).getReg();
3742   Register LHS = MI.getOperand(1).getReg();
3743   Register RHS = MI.getOperand(2).getReg();
3744   uint16_t Flags = MI.getFlags();
3745   LLT ResTy = MRI.getType(Res);
3746 
3747   const MachineFunction &MF = B.getMF();
3748   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3749                             MI.getFlag(MachineInstr::FmAfn);
3750 
3751   if (!AllowInaccurateRcp)
3752     return false;
3753 
3754   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3755     // 1 / x -> RCP(x)
3756     if (CLHS->isExactlyValue(1.0)) {
3757       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3758         .addUse(RHS)
3759         .setMIFlags(Flags);
3760 
3761       MI.eraseFromParent();
3762       return true;
3763     }
3764 
3765     // -1 / x -> RCP( FNEG(x) )
3766     if (CLHS->isExactlyValue(-1.0)) {
3767       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3768       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3769         .addUse(FNeg.getReg(0))
3770         .setMIFlags(Flags);
3771 
3772       MI.eraseFromParent();
3773       return true;
3774     }
3775   }
3776 
3777   // x / y -> x * (1.0 / y)
3778   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3779     .addUse(RHS)
3780     .setMIFlags(Flags);
3781   B.buildFMul(Res, LHS, RCP, Flags);
3782 
3783   MI.eraseFromParent();
3784   return true;
3785 }
3786 
3787 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3788                                                    MachineRegisterInfo &MRI,
3789                                                    MachineIRBuilder &B) const {
3790   Register Res = MI.getOperand(0).getReg();
3791   Register X = MI.getOperand(1).getReg();
3792   Register Y = MI.getOperand(2).getReg();
3793   uint16_t Flags = MI.getFlags();
3794   LLT ResTy = MRI.getType(Res);
3795 
3796   const MachineFunction &MF = B.getMF();
3797   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3798                             MI.getFlag(MachineInstr::FmAfn);
3799 
3800   if (!AllowInaccurateRcp)
3801     return false;
3802 
3803   auto NegY = B.buildFNeg(ResTy, Y);
3804   auto One = B.buildFConstant(ResTy, 1.0);
3805 
3806   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3807     .addUse(Y)
3808     .setMIFlags(Flags);
3809 
3810   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3811   R = B.buildFMA(ResTy, Tmp0, R, R);
3812 
3813   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3814   R = B.buildFMA(ResTy, Tmp1, R, R);
3815 
3816   auto Ret = B.buildFMul(ResTy, X, R);
3817   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3818 
3819   B.buildFMA(Res, Tmp2, R, Ret);
3820   MI.eraseFromParent();
3821   return true;
3822 }
3823 
3824 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3825                                          MachineRegisterInfo &MRI,
3826                                          MachineIRBuilder &B) const {
3827   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3828     return true;
3829 
3830   Register Res = MI.getOperand(0).getReg();
3831   Register LHS = MI.getOperand(1).getReg();
3832   Register RHS = MI.getOperand(2).getReg();
3833 
3834   uint16_t Flags = MI.getFlags();
3835 
3836   LLT S16 = LLT::scalar(16);
3837   LLT S32 = LLT::scalar(32);
3838 
3839   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3840   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3841 
3842   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3843     .addUse(RHSExt.getReg(0))
3844     .setMIFlags(Flags);
3845 
3846   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3847   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3848 
3849   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3850     .addUse(RDst.getReg(0))
3851     .addUse(RHS)
3852     .addUse(LHS)
3853     .setMIFlags(Flags);
3854 
3855   MI.eraseFromParent();
3856   return true;
3857 }
3858 
3859 // Enable or disable FP32 denormal mode. When 'Enable' is true, emit
3860 // instructions to enable denormals; when false, restore the default mode.
3861 static void toggleSPDenormMode(bool Enable,
3862                                MachineIRBuilder &B,
3863                                const GCNSubtarget &ST,
3864                                AMDGPU::SIModeRegisterDefaults Mode) {
3865   // Set SP denorm mode to this value.
3866   unsigned SPDenormMode =
3867     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3868 
3869   if (ST.hasDenormModeInst()) {
3870     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3871     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3872 
3873     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3874     B.buildInstr(AMDGPU::S_DENORM_MODE)
3875       .addImm(NewDenormModeValue);
3876 
3877   } else {
3878     // Select FP32 bit field in mode register.
3879     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3880                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3881                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
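    // This encodes hwreg(HW_REG_MODE, 4, 2) (WIDTH_M1 == 1 means a width of
    // 2), i.e. the two FP32 denorm control bits of the MODE register.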
3882 
3883     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3884       .addImm(SPDenormMode)
3885       .addImm(SPDenormModeBitField);
3886   }
3887 }
3888 
3889 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3890                                          MachineRegisterInfo &MRI,
3891                                          MachineIRBuilder &B) const {
3892   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3893     return true;
3894 
3895   Register Res = MI.getOperand(0).getReg();
3896   Register LHS = MI.getOperand(1).getReg();
3897   Register RHS = MI.getOperand(2).getReg();
3898   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3899   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3900 
3901   uint16_t Flags = MI.getFlags();
3902 
3903   LLT S32 = LLT::scalar(32);
3904   LLT S1 = LLT::scalar(1);
3905 
3906   auto One = B.buildFConstant(S32, 1.0f);
3907 
3908   auto DenominatorScaled =
3909     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3910       .addUse(LHS)
3911       .addUse(RHS)
3912       .addImm(0)
3913       .setMIFlags(Flags);
3914   auto NumeratorScaled =
3915     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3916       .addUse(LHS)
3917       .addUse(RHS)
3918       .addImm(1)
3919       .setMIFlags(Flags);
3920 
3921   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3922     .addUse(DenominatorScaled.getReg(0))
3923     .setMIFlags(Flags);
3924   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3925 
3926   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3927   // aren't modeled as reading it.
3928   if (!Mode.allFP32Denormals())
3929     toggleSPDenormMode(true, B, ST, Mode);
3930 
3931   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3932   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3933   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3934   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3935   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3936   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3937 
3938   if (!Mode.allFP32Denormals())
3939     toggleSPDenormMode(false, B, ST, Mode);
3940 
3941   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3942     .addUse(Fma4.getReg(0))
3943     .addUse(Fma1.getReg(0))
3944     .addUse(Fma3.getReg(0))
3945     .addUse(NumeratorScaled.getReg(1))
3946     .setMIFlags(Flags);
3947 
3948   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3949     .addUse(Fmas.getReg(0))
3950     .addUse(RHS)
3951     .addUse(LHS)
3952     .setMIFlags(Flags);
3953 
3954   MI.eraseFromParent();
3955   return true;
3956 }
3957 
3958 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3959                                          MachineRegisterInfo &MRI,
3960                                          MachineIRBuilder &B) const {
3961   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3962     return true;
3963 
3964   Register Res = MI.getOperand(0).getReg();
3965   Register LHS = MI.getOperand(1).getReg();
3966   Register RHS = MI.getOperand(2).getReg();
3967 
3968   uint16_t Flags = MI.getFlags();
3969 
3970   LLT S64 = LLT::scalar(64);
3971   LLT S1 = LLT::scalar(1);
3972 
3973   auto One = B.buildFConstant(S64, 1.0);
3974 
3975   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3976     .addUse(LHS)
3977     .addUse(RHS)
3978     .addImm(0)
3979     .setMIFlags(Flags);
3980 
3981   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3982 
3983   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3984     .addUse(DivScale0.getReg(0))
3985     .setMIFlags(Flags);
3986 
3987   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3988   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3989   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3990 
3991   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3992     .addUse(LHS)
3993     .addUse(RHS)
3994     .addImm(1)
3995     .setMIFlags(Flags);
3996 
3997   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3998   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3999   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4000 
4001   Register Scale;
4002   if (!ST.hasUsableDivScaleConditionOutput()) {
4003     // Workaround a hardware bug on SI where the condition output from div_scale
4004     // is not usable.
4005 
4006     LLT S32 = LLT::scalar(32);
4007 
4008     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4009     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4010     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4011     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4012 
4013     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4014                               Scale1Unmerge.getReg(1));
4015     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4016                               Scale0Unmerge.getReg(1));
4017     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4018   } else {
4019     Scale = DivScale1.getReg(1);
4020   }
4021 
4022   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4023     .addUse(Fma4.getReg(0))
4024     .addUse(Fma3.getReg(0))
4025     .addUse(Mul.getReg(0))
4026     .addUse(Scale)
4027     .setMIFlags(Flags);
4028 
4029   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
4030     .addUse(Fmas.getReg(0))
4031     .addUse(RHS)
4032     .addUse(LHS)
4033     .setMIFlags(Flags);
4034 
4035   MI.eraseFromParent();
4036   return true;
4037 }
4038 
4039 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4040                                                  MachineRegisterInfo &MRI,
4041                                                  MachineIRBuilder &B) const {
4042   Register Res = MI.getOperand(0).getReg();
4043   Register LHS = MI.getOperand(2).getReg();
4044   Register RHS = MI.getOperand(3).getReg();
4045   uint16_t Flags = MI.getFlags();
4046 
4047   LLT S32 = LLT::scalar(32);
4048   LLT S1 = LLT::scalar(1);
4049 
4050   auto Abs = B.buildFAbs(S32, RHS, Flags);
4052 
4053   auto C0 = B.buildConstant(S32, 0x6f800000);
4054   auto C1 = B.buildConstant(S32, 0x2f800000);
4055   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
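  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: when |RHS| exceeds 2^96, the
  // denominator is pre-scaled by 2^-32, presumably to keep the rcp result out
  // of the denormal range; the final multiply by Sel cancels the scaling.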
4056 
4057   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4058   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4059 
4060   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4061 
4062   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4063     .addUse(Mul0.getReg(0))
4064     .setMIFlags(Flags);
4065 
4066   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4067 
4068   B.buildFMul(Res, Sel, Mul1, Flags);
4069 
4070   MI.eraseFromParent();
4071   return true;
4072 }
4073 
4074 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4075 // FIXME: Why do we handle this one but not other removed instructions?
4076 //
4077 // Reciprocal square root.  The clamp prevents infinite results, clamping
4078 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4079 // +-max_float.
4080 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4081                                                     MachineRegisterInfo &MRI,
4082                                                     MachineIRBuilder &B) const {
4083   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4084     return true;
4085 
4086   Register Dst = MI.getOperand(0).getReg();
4087   Register Src = MI.getOperand(2).getReg();
4088   auto Flags = MI.getFlags();
4089 
4090   LLT Ty = MRI.getType(Dst);
4091 
4092   const fltSemantics *FltSemantics;
4093   if (Ty == LLT::scalar(32))
4094     FltSemantics = &APFloat::IEEEsingle();
4095   else if (Ty == LLT::scalar(64))
4096     FltSemantics = &APFloat::IEEEdouble();
4097   else
4098     return false;
4099 
4100   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4101     .addUse(Src)
4102     .setMIFlags(Flags);
4103 
4104   // We needn't worry about the snan handling difference here; the rsq already
4105   // quieted (or not) any snan, so use the variant which will directly select.
4106   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4107   const bool UseIEEE = MFI->getMode().IEEE;
4108 
4109   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4110   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4111                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4112 
4113   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4114 
4115   if (UseIEEE)
4116     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4117   else
4118     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4119   MI.eraseFromParent();
4120   return true;
4121 }
4122 
4123 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4124   switch (IID) {
4125   case Intrinsic::amdgcn_ds_fadd:
4126     return AMDGPU::G_ATOMICRMW_FADD;
4127   case Intrinsic::amdgcn_ds_fmin:
4128     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4129   case Intrinsic::amdgcn_ds_fmax:
4130     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4131   default:
4132     llvm_unreachable("not a DS FP intrinsic");
4133   }
4134 }
4135 
4136 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4137                                                       MachineInstr &MI,
4138                                                       Intrinsic::ID IID) const {
4139   GISelChangeObserver &Observer = Helper.Observer;
4140   Observer.changingInstr(MI);
4141 
4142   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4143 
4144   // The remaining operands were used to set fields in the MemOperand on
4145   // construction.
4146   for (int I = 6; I > 3; --I)
4147     MI.removeOperand(I);
4148 
4149   MI.removeOperand(1); // Remove the intrinsic ID.
4150   Observer.changedInstr(MI);
4151   return true;
4152 }
4153 
4154 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4155                                             MachineRegisterInfo &MRI,
4156                                             MachineIRBuilder &B) const {
4157   uint64_t Offset =
4158     ST.getTargetLowering()->getImplicitParameterOffset(
4159       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4160   LLT DstTy = MRI.getType(DstReg);
4161   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4162 
4163   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4164   if (!loadInputValue(KernargPtrReg, B,
4165                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4166     return false;
4167 
4168   // FIXME: This should be nuw
4169   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4170   return true;
4171 }
4172 
4173 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4174                                                  MachineRegisterInfo &MRI,
4175                                                  MachineIRBuilder &B) const {
4176   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4177   if (!MFI->isEntryFunction()) {
4178     return legalizePreloadedArgIntrin(MI, MRI, B,
4179                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4180   }
4181 
4182   Register DstReg = MI.getOperand(0).getReg();
4183   if (!getImplicitArgPtr(DstReg, MRI, B))
4184     return false;
4185 
4186   MI.eraseFromParent();
4187   return true;
4188 }
4189 
4190 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4191                                               MachineRegisterInfo &MRI,
4192                                               MachineIRBuilder &B,
4193                                               unsigned AddrSpace) const {
4194   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4195   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4196   Register Hi32 = Unmerge.getReg(1);
4197 
4198   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4199   MI.eraseFromParent();
4200   return true;
4201 }
4202 
4203 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4204 // offset (the offset that is included in bounds checking and swizzling, to be
4205 // split between the instruction's voffset and immoffset fields) and soffset
4206 // (the offset that is excluded from bounds checking and swizzling, to go in
4207 // the instruction's soffset field).  This function takes the first kind of
4208 // offset and figures out how to split it between voffset and immoffset.
4209 std::pair<Register, unsigned>
4210 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4211                                         Register OrigOffset) const {
4212   const unsigned MaxImm = 4095;
4213   Register BaseReg;
4214   unsigned ImmOffset;
4215   const LLT S32 = LLT::scalar(32);
4216   MachineRegisterInfo &MRI = *B.getMRI();
4217 
4218   std::tie(BaseReg, ImmOffset) =
4219       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4220 
4221   // If BaseReg is a pointer, convert it to int.
4222   if (MRI.getType(BaseReg).isPointer())
4223     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4224 
4225   // If the immediate value is too big for the immoffset field, keep only its
4226   // low 12 bits there and fold the remaining multiple of 4096 into the voffset,
4227   // so the value copied/added for the voffset field stands more chance of being
4228   // CSEd with the copy/add for another similar load/store.
4229   // However, do not do that rounding down if the multiple of 4096 would be
4230   // negative, as it appears to be illegal to have a negative offset in the
4231   // vgpr, even if adding the immediate offset makes it positive.
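  // For example (an illustrative value only), an ImmOffset of 5000 becomes
  // ImmOffset = 904 with Overflow = 4096 folded into the voffset.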
4232   unsigned Overflow = ImmOffset & ~MaxImm;
4233   ImmOffset -= Overflow;
4234   if ((int32_t)Overflow < 0) {
4235     Overflow += ImmOffset;
4236     ImmOffset = 0;
4237   }
4238 
4239   if (Overflow != 0) {
4240     if (!BaseReg) {
4241       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4242     } else {
4243       auto OverflowVal = B.buildConstant(S32, Overflow);
4244       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4245     }
4246   }
4247 
4248   if (!BaseReg)
4249     BaseReg = B.buildConstant(S32, 0).getReg(0);
4250 
4251   return std::make_pair(BaseReg, ImmOffset);
4252 }
4253 
4254 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4255 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4256                                           Register VOffset, Register SOffset,
4257                                           unsigned ImmOffset, Register VIndex,
4258                                           MachineRegisterInfo &MRI) const {
4259   Optional<ValueAndVReg> MaybeVOffsetVal =
4260       getIConstantVRegValWithLookThrough(VOffset, MRI);
4261   Optional<ValueAndVReg> MaybeSOffsetVal =
4262       getIConstantVRegValWithLookThrough(SOffset, MRI);
4263   Optional<ValueAndVReg> MaybeVIndexVal =
4264       getIConstantVRegValWithLookThrough(VIndex, MRI);
4265   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4266   // update the MMO with that offset. The stride is unknown so we can only do
4267   // this if VIndex is constant 0.
4268   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4269       MaybeVIndexVal->Value == 0) {
4270     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4271                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4272     MMO->setOffset(TotalOffset);
4273   } else {
4274     // We don't have a constant combined offset to use in the MMO. Give up.
4275     MMO->setValue((Value *)nullptr);
4276   }
4277 }
4278 
4279 /// Handle register layout difference for f16 images for some subtargets.
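/// For example, a <4 x s16> store on a subtarget with unpacked D16 memory
/// instructions becomes a <4 x s32> of any-extended elements, while packed
/// subtargets keep the 16-bit form (padding <3 x s16> to <4 x s16> with
/// undef).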
4280 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4281                                              MachineRegisterInfo &MRI,
4282                                              Register Reg,
4283                                              bool ImageStore) const {
4284   const LLT S16 = LLT::scalar(16);
4285   const LLT S32 = LLT::scalar(32);
4286   LLT StoreVT = MRI.getType(Reg);
4287   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4288 
4289   if (ST.hasUnpackedD16VMem()) {
4290     auto Unmerge = B.buildUnmerge(S16, Reg);
4291 
4292     SmallVector<Register, 4> WideRegs;
4293     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4294       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4295 
4296     int NumElts = StoreVT.getNumElements();
4297 
4298     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4299         .getReg(0);
4300   }
4301 
4302   if (ImageStore && ST.hasImageStoreD16Bug()) {
4303     if (StoreVT.getNumElements() == 2) {
4304       SmallVector<Register, 4> PackedRegs;
4305       Reg = B.buildBitcast(S32, Reg).getReg(0);
4306       PackedRegs.push_back(Reg);
4307       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4308       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4309           .getReg(0);
4310     }
4311 
4312     if (StoreVT.getNumElements() == 3) {
4313       SmallVector<Register, 4> PackedRegs;
4314       auto Unmerge = B.buildUnmerge(S16, Reg);
4315       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4316         PackedRegs.push_back(Unmerge.getReg(I));
4317       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4318       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4319       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4320     }
4321 
4322     if (StoreVT.getNumElements() == 4) {
4323       SmallVector<Register, 4> PackedRegs;
4324       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4325       auto Unmerge = B.buildUnmerge(S32, Reg);
4326       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4327         PackedRegs.push_back(Unmerge.getReg(I));
4328       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4329       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4330           .getReg(0);
4331     }
4332 
4333     llvm_unreachable("invalid data type");
4334   }
4335 
4336   if (StoreVT == LLT::fixed_vector(3, S16)) {
4337     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4338               .getReg(0);
4339   }
4340   return Reg;
4341 }
4342 
4343 Register AMDGPULegalizerInfo::fixStoreSourceType(
4344   MachineIRBuilder &B, Register VData, bool IsFormat) const {
4345   MachineRegisterInfo *MRI = B.getMRI();
4346   LLT Ty = MRI->getType(VData);
4347 
4348   const LLT S16 = LLT::scalar(16);
4349 
  // Fix up illegal register types for i8 and i16 stores.
4351   if (Ty == LLT::scalar(8) || Ty == S16) {
4352     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4353     return AnyExt;
4354   }
4355 
4356   if (Ty.isVector()) {
4357     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4358       if (IsFormat)
4359         return handleD16VData(B, *MRI, VData);
4360     }
4361   }
4362 
4363   return VData;
4364 }
4365 
4366 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4367                                               MachineRegisterInfo &MRI,
4368                                               MachineIRBuilder &B,
4369                                               bool IsTyped,
4370                                               bool IsFormat) const {
4371   Register VData = MI.getOperand(1).getReg();
4372   LLT Ty = MRI.getType(VData);
4373   LLT EltTy = Ty.getScalarType();
4374   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4375   const LLT S32 = LLT::scalar(32);
4376 
4377   VData = fixStoreSourceType(B, VData, IsFormat);
4378   Register RSrc = MI.getOperand(2).getReg();
4379 
4380   MachineMemOperand *MMO = *MI.memoperands_begin();
4381   const int MemSize = MMO->getSize();
4382 
4383   unsigned ImmOffset;
4384 
4385   // The typed intrinsics add an immediate after the registers.
4386   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4387 
4388   // The struct intrinsic variants add one additional operand over raw.
4389   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
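  // For reference, the operands here are (intrinsic ID, vdata, rsrc,
  // [vindex,] voffset, soffset, [format,] aux); only the struct variants
  // carry vindex and reach the 7 / 8 operand counts checked above.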
4390   Register VIndex;
4391   int OpOffset = 0;
4392   if (HasVIndex) {
4393     VIndex = MI.getOperand(3).getReg();
4394     OpOffset = 1;
4395   } else {
4396     VIndex = B.buildConstant(S32, 0).getReg(0);
4397   }
4398 
4399   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4400   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4401 
4402   unsigned Format = 0;
4403   if (IsTyped) {
4404     Format = MI.getOperand(5 + OpOffset).getImm();
4405     ++OpOffset;
4406   }
4407 
4408   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4409 
4410   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4411   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4412 
4413   unsigned Opc;
4414   if (IsTyped) {
4415     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4416                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4417   } else if (IsFormat) {
4418     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4419                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4420   } else {
4421     switch (MemSize) {
4422     case 1:
4423       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4424       break;
4425     case 2:
4426       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4427       break;
4428     default:
4429       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4430       break;
4431     }
4432   }
4433 
4434   auto MIB = B.buildInstr(Opc)
4435     .addUse(VData)              // vdata
4436     .addUse(RSrc)               // rsrc
4437     .addUse(VIndex)             // vindex
4438     .addUse(VOffset)            // voffset
4439     .addUse(SOffset)            // soffset
4440     .addImm(ImmOffset);         // offset(imm)
4441 
4442   if (IsTyped)
4443     MIB.addImm(Format);
4444 
4445   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4446      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4447      .addMemOperand(MMO);
4448 
4449   MI.eraseFromParent();
4450   return true;
4451 }
4452 
4453 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4454                                              MachineRegisterInfo &MRI,
4455                                              MachineIRBuilder &B,
4456                                              bool IsFormat,
4457                                              bool IsTyped) const {
4458   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4459   MachineMemOperand *MMO = *MI.memoperands_begin();
4460   const LLT MemTy = MMO->getMemoryType();
4461   const LLT S32 = LLT::scalar(32);
4462 
4463   Register Dst = MI.getOperand(0).getReg();
4464   Register RSrc = MI.getOperand(2).getReg();
4465 
4466   // The typed intrinsics add an immediate after the registers.
4467   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4468 
4469   // The struct intrinsic variants add one additional operand over raw.
4470   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4471   Register VIndex;
4472   int OpOffset = 0;
4473   if (HasVIndex) {
4474     VIndex = MI.getOperand(3).getReg();
4475     OpOffset = 1;
4476   } else {
4477     VIndex = B.buildConstant(S32, 0).getReg(0);
4478   }
4479 
4480   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4481   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4482 
4483   unsigned Format = 0;
4484   if (IsTyped) {
4485     Format = MI.getOperand(5 + OpOffset).getImm();
4486     ++OpOffset;
4487   }
4488 
4489   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4490   unsigned ImmOffset;
4491 
4492   LLT Ty = MRI.getType(Dst);
4493   LLT EltTy = Ty.getScalarType();
4494   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4495   const bool Unpacked = ST.hasUnpackedD16VMem();
4496 
4497   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4498   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4499 
4500   unsigned Opc;
4501 
4502   if (IsTyped) {
4503     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4504                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4505   } else if (IsFormat) {
4506     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4507                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4508   } else {
4509     switch (MemTy.getSizeInBits()) {
4510     case 8:
4511       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4512       break;
4513     case 16:
4514       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4515       break;
4516     default:
4517       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4518       break;
4519     }
4520   }
4521 
4522   Register LoadDstReg;
4523 
4524   bool IsExtLoad =
4525       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4526   LLT UnpackedTy = Ty.changeElementSize(32);
4527 
4528   if (IsExtLoad)
4529     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4530   else if (Unpacked && IsD16 && Ty.isVector())
4531     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4532   else
4533     LoadDstReg = Dst;
4534 
4535   auto MIB = B.buildInstr(Opc)
4536     .addDef(LoadDstReg)         // vdata
4537     .addUse(RSrc)               // rsrc
4538     .addUse(VIndex)             // vindex
4539     .addUse(VOffset)            // voffset
4540     .addUse(SOffset)            // soffset
4541     .addImm(ImmOffset);         // offset(imm)
4542 
4543   if (IsTyped)
4544     MIB.addImm(Format);
4545 
4546   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4547      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4548      .addMemOperand(MMO);
4549 
4550   if (LoadDstReg != Dst) {
4551     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4552 
    // The result register was widened for the extending load; truncate it
    // back to the original type.
4554     if (IsExtLoad)
4555       B.buildTrunc(Dst, LoadDstReg);
4556     else {
4557       // Repack to original 16-bit vector result
4558       // FIXME: G_TRUNC should work, but legalization currently fails
4559       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4560       SmallVector<Register, 4> Repack;
4561       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4562         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4563       B.buildMerge(Dst, Repack);
4564     }
4565   }
4566 
4567   MI.eraseFromParent();
4568   return true;
4569 }
4570 
4571 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4572                                                MachineIRBuilder &B,
4573                                                bool IsInc) const {
4574   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4575                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4576   B.buildInstr(Opc)
4577     .addDef(MI.getOperand(0).getReg())
4578     .addUse(MI.getOperand(2).getReg())
4579     .addUse(MI.getOperand(3).getReg())
4580     .cloneMemRefs(MI);
4581   MI.eraseFromParent();
4582   return true;
4583 }
4584 
4585 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4586   switch (IntrID) {
4587   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4588   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4589     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4590   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4591   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4592     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4593   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4594   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4595     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4596   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4597   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4598     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4599   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4600   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4601     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4602   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4603   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4604     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4605   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4606   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4607     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4608   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4609   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4610     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4611   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4612   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4613     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4614   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4615   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4616     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4617   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4618   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4619     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4620   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4621   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4622     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4623   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4624   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4625     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4626   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4627   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4628     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4629   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4630   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4631     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4632   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4633   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4634     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4635   default:
4636     llvm_unreachable("unhandled atomic opcode");
4637   }
4638 }
4639 
4640 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4641                                                MachineIRBuilder &B,
4642                                                Intrinsic::ID IID) const {
4643   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4644                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4645   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4646 
4647   Register Dst;
4648 
4649   int OpOffset = 0;
4650   if (HasReturn) {
4651     // A few FP atomics do not support return values.
4652     Dst = MI.getOperand(0).getReg();
4653   } else {
4654     OpOffset = -1;
4655   }
4656 
4657   Register VData = MI.getOperand(2 + OpOffset).getReg();
4658   Register CmpVal;
4659 
4660   if (IsCmpSwap) {
4661     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4662     ++OpOffset;
4663   }
4664 
4665   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4666   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
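  // For example, struct.buffer.atomic.cmpswap with a return value has
  // (dst, intrinsic ID, vdata, cmp, rsrc, vindex, voffset, soffset, aux),
  // i.e. NumVIndexOps == 9.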
4667 
4668   // The struct intrinsic variants add one additional operand over raw.
4669   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4670   Register VIndex;
4671   if (HasVIndex) {
4672     VIndex = MI.getOperand(4 + OpOffset).getReg();
4673     ++OpOffset;
4674   } else {
4675     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4676   }
4677 
4678   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4679   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4680   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4681 
4682   MachineMemOperand *MMO = *MI.memoperands_begin();
4683 
4684   unsigned ImmOffset;
4685   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4686   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4687 
4688   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4689 
4690   if (HasReturn)
4691     MIB.addDef(Dst);
4692 
4693   MIB.addUse(VData); // vdata
4694 
4695   if (IsCmpSwap)
4696     MIB.addReg(CmpVal);
4697 
4698   MIB.addUse(RSrc)               // rsrc
4699      .addUse(VIndex)             // vindex
4700      .addUse(VOffset)            // voffset
4701      .addUse(SOffset)            // soffset
4702      .addImm(ImmOffset)          // offset(imm)
4703      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4704      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4705      .addMemOperand(MMO);
4706 
4707   MI.eraseFromParent();
4708   return true;
4709 }
4710 
4711 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4712 /// vector with s16 typed elements.
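/// For example, a 2D sample with A16 packs the (u, v) coordinates into a
/// single <2 x s16> dword; an odd trailing coordinate or gradient component is
/// instead paired with an undef half.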
4713 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4714                                       SmallVectorImpl<Register> &PackedAddrs,
4715                                       unsigned ArgOffset,
4716                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4717                                       bool IsA16, bool IsG16) {
4718   const LLT S16 = LLT::scalar(16);
4719   const LLT V2S16 = LLT::fixed_vector(2, 16);
4720   auto EndIdx = Intr->VAddrEnd;
4721 
4722   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4723     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4724     if (!SrcOp.isReg())
4725       continue; // _L to _LZ may have eliminated this.
4726 
4727     Register AddrReg = SrcOp.getReg();
4728 
4729     if ((I < Intr->GradientStart) ||
4730         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4731         (I >= Intr->CoordStart && !IsA16)) {
4732       if ((I < Intr->GradientStart) && IsA16 &&
4733           (B.getMRI()->getType(AddrReg) == S16)) {
4734         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit dword.
4737         PackedAddrs.push_back(
4738             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4739                 .getReg(0));
4740       } else {
4741         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4742                "Bias needs to be converted to 16 bit in A16 mode");
4743         // Handle any gradient or coordinate operands that should not be packed
4744         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4745         PackedAddrs.push_back(AddrReg);
4746       }
4747     } else {
4748       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4749       // derivatives dx/dh and dx/dv are packed with undef.
4750       if (((I + 1) >= EndIdx) ||
4751           ((Intr->NumGradients / 2) % 2 == 1 &&
4752            (I == static_cast<unsigned>(Intr->GradientStart +
4753                                        (Intr->NumGradients / 2) - 1) ||
4754             I == static_cast<unsigned>(Intr->GradientStart +
4755                                        Intr->NumGradients - 1))) ||
4756           // Check for _L to _LZ optimization
4757           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4758         PackedAddrs.push_back(
4759             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4760                 .getReg(0));
4761       } else {
4762         PackedAddrs.push_back(
4763             B.buildBuildVector(
4764                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4765                 .getReg(0));
4766         ++I;
4767       }
4768     }
4769   }
4770 }
4771 
4772 /// Convert from separate vaddr components to a single vector address register,
4773 /// and replace the remaining operands with $noreg.
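/// For example, three s32 address operands are replaced by a single
/// <3 x s32> build_vector in the first vaddr slot, with the two remaining
/// operands set to $noreg.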
4774 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4775                                      int DimIdx, int NumVAddrs) {
4776   const LLT S32 = LLT::scalar(32);
4777 
4778   SmallVector<Register, 8> AddrRegs;
4779   for (int I = 0; I != NumVAddrs; ++I) {
4780     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4781     if (SrcOp.isReg()) {
4782       AddrRegs.push_back(SrcOp.getReg());
4783       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4784     }
4785   }
4786 
4787   int NumAddrRegs = AddrRegs.size();
4788   if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
4790     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4791       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4792       auto Undef = B.buildUndef(S32);
4793       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4794       NumAddrRegs = RoundedNumRegs;
4795     }
4796 
4797     auto VAddr =
4798         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4799     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4800   }
4801 
4802   for (int I = 1; I != NumVAddrs; ++I) {
4803     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4804     if (SrcOp.isReg())
4805       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4806   }
4807 }
4808 
4809 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4810 ///
4811 /// Depending on the subtarget, load/store with 16-bit element data need to be
4812 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4813 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4814 /// registers.
4815 ///
4816 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
4818 /// want a selected instruction entering RegBankSelect. In order to avoid
4819 /// defining a multitude of intermediate image instructions, directly hack on
4820 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4821 /// padding now unnecessary arguments with $noreg.
4822 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4823     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4824     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4825 
4826   const unsigned NumDefs = MI.getNumExplicitDefs();
4827   const unsigned ArgOffset = NumDefs + 1;
4828   bool IsTFE = NumDefs == 2;
4829   // We are only processing the operands of d16 image operations on subtargets
4830   // that use the unpacked register layout, or need to repack the TFE result.
4831 
4832   // TODO: Do we need to guard against already legalized intrinsics?
4833   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4834       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4835 
4836   MachineRegisterInfo *MRI = B.getMRI();
4837   const LLT S32 = LLT::scalar(32);
4838   const LLT S16 = LLT::scalar(16);
4839   const LLT V2S16 = LLT::fixed_vector(2, 16);
4840 
4841   unsigned DMask = 0;
4842   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4843   LLT Ty = MRI->getType(VData);
4844 
4845   // Check for 16 bit addresses and pack if true.
4846   LLT GradTy =
4847       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4848   LLT AddrTy =
4849       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4850   const bool IsG16 = GradTy == S16;
4851   const bool IsA16 = AddrTy == S16;
4852   const bool IsD16 = Ty.getScalarType() == S16;
4853 
4854   int DMaskLanes = 0;
4855   if (!BaseOpcode->Atomic) {
4856     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4857     if (BaseOpcode->Gather4) {
4858       DMaskLanes = 4;
4859     } else if (DMask != 0) {
4860       DMaskLanes = countPopulation(DMask);
4861     } else if (!IsTFE && !BaseOpcode->Store) {
4862       // If dmask is 0, this is a no-op load. This can be eliminated.
4863       B.buildUndef(MI.getOperand(0));
4864       MI.eraseFromParent();
4865       return true;
4866     }
4867   }
4868 
4869   Observer.changingInstr(MI);
4870   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4871 
4872   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4873                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4874   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4875                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4876   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4877 
4878   // Track that we legalized this
4879   MI.setDesc(B.getTII().get(NewOpcode));
4880 
  // Expecting to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4883   if (IsTFE && DMask == 0) {
4884     DMask = 0x1;
4885     DMaskLanes = 1;
4886     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4887   }
4888 
4889   if (BaseOpcode->Atomic) {
4890     Register VData0 = MI.getOperand(2).getReg();
4891     LLT Ty = MRI->getType(VData0);
4892 
4893     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4894     if (Ty.isVector())
4895       return false;
4896 
4897     if (BaseOpcode->AtomicX2) {
4898       Register VData1 = MI.getOperand(3).getReg();
4899       // The two values are packed in one register.
4900       LLT PackedTy = LLT::fixed_vector(2, Ty);
4901       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4902       MI.getOperand(2).setReg(Concat.getReg(0));
4903       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4904     }
4905   }
4906 
4907   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4908 
4909   // Rewrite the addressing register layout before doing anything else.
4910   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4911     // 16 bit gradients are supported, but are tied to the A16 control
4912     // so both gradients and addresses must be 16 bit
4913     return false;
4914   }
4915 
4916   if (IsA16 && !ST.hasA16()) {
4917     // A16 not supported
4918     return false;
4919   }
4920 
4921   if (IsA16 || IsG16) {
4922     if (Intr->NumVAddrs > 1) {
4923       SmallVector<Register, 4> PackedRegs;
4924 
4925       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4926                                 IsG16);
4927 
4928       // See also below in the non-a16 branch
4929       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4930                           PackedRegs.size() <= ST.getNSAMaxSize();
4931 
4932       if (!UseNSA && PackedRegs.size() > 1) {
4933         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4934         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4935         PackedRegs[0] = Concat.getReg(0);
4936         PackedRegs.resize(1);
4937       }
4938 
4939       const unsigned NumPacked = PackedRegs.size();
4940       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4941         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4942         if (!SrcOp.isReg()) {
4943           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4944           continue;
4945         }
4946 
4947         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4948 
4949         if (I - Intr->VAddrStart < NumPacked)
4950           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4951         else
4952           SrcOp.setReg(AMDGPU::NoRegister);
4953       }
4954     }
4955   } else {
4956     // If the register allocator cannot place the address registers contiguously
4957     // without introducing moves, then using the non-sequential address encoding
4958     // is always preferable, since it saves VALU instructions and is usually a
4959     // wash in terms of code size or even better.
4960     //
4961     // However, we currently have no way of hinting to the register allocator
4962     // that MIMG addresses should be placed contiguously when it is possible to
4963     // do so, so force non-NSA for the common 2-address case as a heuristic.
4964     //
4965     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4966     // allocation when possible.
4967     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4968                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4969 
4970     if (!UseNSA && Intr->NumVAddrs > 1)
4971       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4972                                Intr->NumVAddrs);
4973   }
4974 
4975   int Flags = 0;
4976   if (IsA16)
4977     Flags |= 1;
4978   if (IsG16)
4979     Flags |= 2;
4980   MI.addOperand(MachineOperand::CreateImm(Flags));
4981 
4982   if (BaseOpcode->Store) { // No TFE for stores?
4983     // TODO: Handle dmask trim
4984     if (!Ty.isVector() || !IsD16)
4985       return true;
4986 
4987     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4988     if (RepackedReg != VData) {
4989       MI.getOperand(1).setReg(RepackedReg);
4990     }
4991 
4992     return true;
4993   }
4994 
4995   Register DstReg = MI.getOperand(0).getReg();
4996   const LLT EltTy = Ty.getScalarType();
4997   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4998 
4999   // Confirm that the return type is large enough for the dmask specified
5000   if (NumElts < DMaskLanes)
5001     return false;
5002 
5003   if (NumElts > 4 || DMaskLanes > 4)
5004     return false;
5005 
5006   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5007   const LLT AdjustedTy =
5008       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5009 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
5013   LLT RoundedTy;
5014 
  // S32 vector to cover all data, plus the TFE result element.
5016   LLT TFETy;
5017 
5018   // Register type to use for each loaded component. Will be S32 or V2S16.
5019   LLT RegTy;
5020 
5021   if (IsD16 && ST.hasUnpackedD16VMem()) {
5022     RoundedTy =
5023         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5024     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5025     RegTy = S32;
5026   } else {
5027     unsigned EltSize = EltTy.getSizeInBits();
5028     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5029     unsigned RoundedSize = 32 * RoundedElts;
5030     RoundedTy = LLT::scalarOrVector(
5031         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5032     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5033     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5034   }
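  // For example, a packed-D16 <3 x s16> result with three dmask lanes gives
  // AdjustedTy = <3 x s16>, RoundedTy = <4 x s16> and TFETy = <3 x s32>, with
  // RegTy = <2 x s16> (or s32 when TFE is enabled).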
5035 
5036   // The return type does not need adjustment.
5037   // TODO: Should we change s16 case to s32 or <2 x s16>?
5038   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5039     return true;
5040 
5041   Register Dst1Reg;
5042 
5043   // Insert after the instruction.
5044   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5045 
5046   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5047   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5048   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5049   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5050 
5051   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5052 
5053   MI.getOperand(0).setReg(NewResultReg);
5054 
5055   // In the IR, TFE is supposed to be used with a 2 element struct return
5056   // type. The instruction really returns these two values in one contiguous
5057   // register, with one additional dword beyond the loaded data. Rewrite the
5058   // return type to use a single register result.
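  // For example, a TFE load of <4 x s32> uses a <5 x s32> NewResultReg; the
  // first four unmerged registers carry the data and the fifth is written
  // directly into the original error-flag destination.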
5059 
5060   if (IsTFE) {
5061     Dst1Reg = MI.getOperand(1).getReg();
5062     if (MRI->getType(Dst1Reg) != S32)
5063       return false;
5064 
5065     // TODO: Make sure the TFE operand bit is set.
5066     MI.removeOperand(1);
5067 
5068     // Handle the easy case that requires no repack instructions.
5069     if (Ty == S32) {
5070       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5071       return true;
5072     }
5073   }
5074 
5075   // Now figure out how to copy the new result register back into the old
5076   // result.
5077   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5078 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5080 
5081   if (ResultNumRegs == 1) {
5082     assert(!IsTFE);
5083     ResultRegs[0] = NewResultReg;
5084   } else {
5085     // We have to repack into a new vector of some kind.
5086     for (int I = 0; I != NumDataRegs; ++I)
5087       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5088     B.buildUnmerge(ResultRegs, NewResultReg);
5089 
5090     // Drop the final TFE element to get the data part. The TFE result is
5091     // directly written to the right place already.
5092     if (IsTFE)
5093       ResultRegs.resize(NumDataRegs);
5094   }
5095 
5096   // For an s16 scalar result, we form an s32 result with a truncate regardless
5097   // of packed vs. unpacked.
5098   if (IsD16 && !Ty.isVector()) {
5099     B.buildTrunc(DstReg, ResultRegs[0]);
5100     return true;
5101   }
5102 
5103   // Avoid a build/concat_vector of 1 entry.
5104   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5105     B.buildBitcast(DstReg, ResultRegs[0]);
5106     return true;
5107   }
5108 
5109   assert(Ty.isVector());
5110 
5111   if (IsD16) {
5112     // For packed D16 results with TFE enabled, all the data components are
5113     // S32. Cast back to the expected type.
5114     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
5117     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5118       for (Register &Reg : ResultRegs)
5119         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5120     } else if (ST.hasUnpackedD16VMem()) {
5121       for (Register &Reg : ResultRegs)
5122         Reg = B.buildTrunc(S16, Reg).getReg(0);
5123     }
5124   }
5125 
5126   auto padWithUndef = [&](LLT Ty, int NumElts) {
5127     if (NumElts == 0)
5128       return;
5129     Register Undef = B.buildUndef(Ty).getReg(0);
5130     for (int I = 0; I != NumElts; ++I)
5131       ResultRegs.push_back(Undef);
5132   };
5133 
5134   // Pad out any elements eliminated due to the dmask.
5135   LLT ResTy = MRI->getType(ResultRegs[0]);
5136   if (!ResTy.isVector()) {
5137     padWithUndef(ResTy, NumElts - ResultRegs.size());
5138     B.buildBuildVector(DstReg, ResultRegs);
5139     return true;
5140   }
5141 
5142   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5143   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5144 
5145   // Deal with the one annoying legal case.
5146   const LLT V3S16 = LLT::fixed_vector(3, 16);
5147   if (Ty == V3S16) {
5148     if (IsTFE) {
5149       if (ResultRegs.size() == 1) {
5150         NewResultReg = ResultRegs[0];
5151       } else if (ResultRegs.size() == 2) {
5152         LLT V4S16 = LLT::fixed_vector(4, 16);
5153         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5154       } else {
5155         return false;
5156       }
5157     }
5158 
5159     if (MRI->getType(DstReg).getNumElements() <
5160         MRI->getType(NewResultReg).getNumElements()) {
5161       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5162     } else {
5163       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5164     }
5165     return true;
5166   }
5167 
5168   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5169   B.buildConcatVectors(DstReg, ResultRegs);
5170   return true;
5171 }
5172 
5173 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5174   LegalizerHelper &Helper, MachineInstr &MI) const {
5175   MachineIRBuilder &B = Helper.MIRBuilder;
5176   GISelChangeObserver &Observer = Helper.Observer;
5177 
5178   Register Dst = MI.getOperand(0).getReg();
5179   LLT Ty = B.getMRI()->getType(Dst);
5180   unsigned Size = Ty.getSizeInBits();
5181   MachineFunction &MF = B.getMF();
5182 
5183   Observer.changingInstr(MI);
5184 
5185   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5186     Ty = getBitcastRegisterType(Ty);
5187     Helper.bitcastDst(MI, Ty, 0);
5188     Dst = MI.getOperand(0).getReg();
5189     B.setInsertPt(B.getMBB(), MI);
5190   }
5191 
5192   // FIXME: We don't really need this intermediate instruction. The intrinsic
5193   // should be fixed to have a memory operand. Since it's readnone, we're not
5194   // allowed to add one.
5195   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5196   MI.removeOperand(1); // Remove intrinsic ID
5197 
5198   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5199   // TODO: Should this use datalayout alignment?
5200   const unsigned MemSize = (Size + 7) / 8;
5201   const Align MemAlign(4);
5202   MachineMemOperand *MMO = MF.getMachineMemOperand(
5203       MachinePointerInfo(),
5204       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5205           MachineMemOperand::MOInvariant,
5206       MemSize, MemAlign);
5207   MI.addMemOperand(MF, MMO);
5208 
5209   // There are no 96-bit result scalar loads, but widening to 128-bit should
5210   // always be legal. We may need to restore this to a 96-bit result if it turns
5211   // out this needs to be converted to a vector load during RegBankSelect.
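  // For example, an s96 result is widened to s128 and a <3 x s32> result is
  // padded to <4 x s32>.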
5212   if (!isPowerOf2_32(Size)) {
5213     if (Ty.isVector())
5214       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5215     else
5216       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5217   }
5218 
5219   Observer.changedInstr(MI);
5220   return true;
5221 }
5222 
5223 // TODO: Move to selection
5224 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5225                                                 MachineRegisterInfo &MRI,
5226                                                 MachineIRBuilder &B) const {
5227   if (!ST.isTrapHandlerEnabled() ||
5228       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5229     return legalizeTrapEndpgm(MI, MRI, B);
5230 
5231   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5232     switch (*HsaAbiVer) {
5233     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5234     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5235       return legalizeTrapHsaQueuePtr(MI, MRI, B);
5236     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5237     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5238       return ST.supportsGetDoorbellID() ?
5239           legalizeTrapHsa(MI, MRI, B) :
5240           legalizeTrapHsaQueuePtr(MI, MRI, B);
5241     }
5242   }
5243 
5244   llvm_unreachable("Unknown trap handler");
5245 }
5246 
5247 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5248     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5249   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5250   MI.eraseFromParent();
5251   return true;
5252 }
5253 
5254 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5255     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5256   MachineFunction &MF = B.getMF();
5257   const LLT S64 = LLT::scalar(64);
5258 
5259   Register SGPR01(AMDGPU::SGPR0_SGPR1);
5260   // For code object version 5, queue_ptr is passed through implicit kernarg.
5261   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5262     AMDGPUTargetLowering::ImplicitParameter Param =
5263         AMDGPUTargetLowering::QUEUE_PTR;
5264     uint64_t Offset =
5265         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5266 
5267     Register KernargPtrReg = MRI.createGenericVirtualRegister(
5268         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5269 
5270     if (!loadInputValue(KernargPtrReg, B,
5271                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5272       return false;
5273 
5274     // TODO: can we be smarter about machine pointer info?
5275     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5276     MachineMemOperand *MMO = MF.getMachineMemOperand(
5277         PtrInfo,
5278         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5279             MachineMemOperand::MOInvariant,
5280         LLT::scalar(64), commonAlignment(Align(64), Offset));
5281 
5282     // Pointer address
5283     Register LoadAddr = MRI.createGenericVirtualRegister(
5284         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5285     B.buildPtrAdd(LoadAddr, KernargPtrReg,
5286                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5287     // Load address
5288     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5289     B.buildCopy(SGPR01, Temp);
5290     B.buildInstr(AMDGPU::S_TRAP)
5291         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5292         .addReg(SGPR01, RegState::Implicit);
5293     MI.eraseFromParent();
5294     return true;
5295   }
5296 
5297   // Pass queue pointer to trap handler as input, and insert trap instruction
5298   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5299   Register LiveIn =
5300     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5301   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5302     return false;
5303 
5304   B.buildCopy(SGPR01, LiveIn);
5305   B.buildInstr(AMDGPU::S_TRAP)
5306       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5307       .addReg(SGPR01, RegState::Implicit);
5308 
5309   MI.eraseFromParent();
5310   return true;
5311 }
5312 
5313 bool AMDGPULegalizerInfo::legalizeTrapHsa(
5314     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5315   B.buildInstr(AMDGPU::S_TRAP)
5316       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5317   MI.eraseFromParent();
5318   return true;
5319 }
5320 
5321 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5322     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
5325   if (!ST.isTrapHandlerEnabled() ||
5326       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5327     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5328                                      "debugtrap handler not supported",
5329                                      MI.getDebugLoc(), DS_Warning);
5330     LLVMContext &Ctx = B.getMF().getFunction().getContext();
5331     Ctx.diagnose(NoTrap);
5332   } else {
5333     // Insert debug-trap instruction
5334     B.buildInstr(AMDGPU::S_TRAP)
5335         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5336   }
5337 
5338   MI.eraseFromParent();
5339   return true;
5340 }
5341 
5342 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5343                                                MachineIRBuilder &B) const {
5344   MachineRegisterInfo &MRI = *B.getMRI();
5345   const LLT S16 = LLT::scalar(16);
5346   const LLT S32 = LLT::scalar(32);
5347 
5348   Register DstReg = MI.getOperand(0).getReg();
5349   Register NodePtr = MI.getOperand(2).getReg();
5350   Register RayExtent = MI.getOperand(3).getReg();
5351   Register RayOrigin = MI.getOperand(4).getReg();
5352   Register RayDir = MI.getOperand(5).getReg();
5353   Register RayInvDir = MI.getOperand(6).getReg();
5354   Register TDescr = MI.getOperand(7).getReg();
5355 
5356   if (!ST.hasGFX10_AEncoding()) {
5357     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5358                                         "intrinsic not supported on subtarget",
5359                                         MI.getDebugLoc());
5360     B.getMF().getFunction().getContext().diagnose(BadIntrin);
5361     return false;
5362   }
5363 
5364   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5365   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5366   const unsigned NumVDataDwords = 4;
5367   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
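  // Address dword breakdown: 1 or 2 dwords for the node pointer, 1 for the
  // ray extent, 3 for the origin, and either 6 unpacked dwords or (with A16)
  // 3 dwords holding the six 16-bit direction / inverse-direction components
  // packed pairwise below.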
5368   const bool UseNSA =
5369       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
5370   const unsigned BaseOpcodes[2][2] = {
5371       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5372       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5373        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5374   int Opcode;
5375   if (UseNSA) {
5376     Opcode =
5377         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
5378                               NumVDataDwords, NumVAddrDwords);
5379   } else {
5380     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5381                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
5382                                    PowerOf2Ceil(NumVAddrDwords));
5383   }
5384   assert(Opcode != -1);
5385 
5386   SmallVector<Register, 12> Ops;
5387   if (Is64) {
5388     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5389     Ops.push_back(Unmerge.getReg(0));
5390     Ops.push_back(Unmerge.getReg(1));
5391   } else {
5392     Ops.push_back(NodePtr);
5393   }
5394   Ops.push_back(RayExtent);
5395 
5396   auto packLanes = [&Ops, &S32, &B](Register Src) {
5397     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5398     Ops.push_back(Unmerge.getReg(0));
5399     Ops.push_back(Unmerge.getReg(1));
5400     Ops.push_back(Unmerge.getReg(2));
5401   };
5402 
5403   packLanes(RayOrigin);
5404   if (IsA16) {
5405     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5406     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5407     Register R1 = MRI.createGenericVirtualRegister(S32);
5408     Register R2 = MRI.createGenericVirtualRegister(S32);
5409     Register R3 = MRI.createGenericVirtualRegister(S32);
5410     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5411     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5412     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5413     Ops.push_back(R1);
5414     Ops.push_back(R2);
5415     Ops.push_back(R3);
5416   } else {
5417     packLanes(RayDir);
5418     packLanes(RayInvDir);
5419   }
5420 
5421   if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
5423     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5424     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5425     Ops.clear();
5426     Ops.push_back(MergedOps);
5427   }
5428 
5429   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5430     .addDef(DstReg)
5431     .addImm(Opcode);
5432 
5433   for (Register R : Ops) {
5434     MIB.addUse(R);
5435   }
5436 
5437   MIB.addUse(TDescr)
5438      .addImm(IsA16 ? 1 : 0)
5439      .cloneMemRefs(MI);
5440 
5441   MI.eraseFromParent();
5442   return true;
5443 }
5444 
5445 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5446                                                MachineIRBuilder &B) const {
5447   unsigned Opc;
5448   int RoundMode = MI.getOperand(2).getImm();
5449 
5450   if (RoundMode == (int)RoundingMode::TowardPositive)
5451     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5452   else if (RoundMode == (int)RoundingMode::TowardNegative)
5453     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5454   else
5455     return false;
5456 
5457   B.buildInstr(Opc)
5458       .addDef(MI.getOperand(0).getReg())
5459       .addUse(MI.getOperand(1).getReg());
5460 
5461   MI.eraseFromParent();
5462 
5463   return true;
5464 }
5465 
5466 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5467                                             MachineInstr &MI) const {
5468   MachineIRBuilder &B = Helper.MIRBuilder;
5469   MachineRegisterInfo &MRI = *B.getMRI();
5470 
  // Replace the G_BRCOND that uses these control-flow intrinsics with the
  // exec-manipulating branch pseudos.
5472   auto IntrID = MI.getIntrinsicID();
5473   switch (IntrID) {
5474   case Intrinsic::amdgcn_if:
5475   case Intrinsic::amdgcn_else: {
5476     MachineInstr *Br = nullptr;
5477     MachineBasicBlock *UncondBrTarget = nullptr;
5478     bool Negated = false;
5479     if (MachineInstr *BrCond =
5480             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5481       const SIRegisterInfo *TRI
5482         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5483 
5484       Register Def = MI.getOperand(1).getReg();
5485       Register Use = MI.getOperand(3).getReg();
5486 
5487       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5488 
5489       if (Negated)
5490         std::swap(CondBrTarget, UncondBrTarget);
5491 
5492       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5493       if (IntrID == Intrinsic::amdgcn_if) {
5494         B.buildInstr(AMDGPU::SI_IF)
5495           .addDef(Def)
5496           .addUse(Use)
5497           .addMBB(UncondBrTarget);
5498       } else {
5499         B.buildInstr(AMDGPU::SI_ELSE)
5500             .addDef(Def)
5501             .addUse(Use)
5502             .addMBB(UncondBrTarget);
5503       }
5504 
5505       if (Br) {
5506         Br->getOperand(0).setMBB(CondBrTarget);
5507       } else {
5508         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5509         // since we're swapping branch targets it needs to be reinserted.
5510         // FIXME: IRTranslator should probably not do this
5511         B.buildBr(*CondBrTarget);
5512       }
5513 
5514       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5515       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5516       MI.eraseFromParent();
5517       BrCond->eraseFromParent();
5518       return true;
5519     }
5520 
5521     return false;
5522   }
5523   case Intrinsic::amdgcn_loop: {
5524     MachineInstr *Br = nullptr;
5525     MachineBasicBlock *UncondBrTarget = nullptr;
5526     bool Negated = false;
5527     if (MachineInstr *BrCond =
5528             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5529       const SIRegisterInfo *TRI
5530         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5531 
5532       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5533       Register Reg = MI.getOperand(2).getReg();
5534 
5535       if (Negated)
5536         std::swap(CondBrTarget, UncondBrTarget);
5537 
5538       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5539       B.buildInstr(AMDGPU::SI_LOOP)
5540         .addUse(Reg)
5541         .addMBB(UncondBrTarget);
5542 
5543       if (Br)
5544         Br->getOperand(0).setMBB(CondBrTarget);
5545       else
5546         B.buildBr(*CondBrTarget);
5547 
5548       MI.eraseFromParent();
5549       BrCond->eraseFromParent();
5550       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5551       return true;
5552     }
5553 
5554     return false;
5555   }
5556   case Intrinsic::amdgcn_kernarg_segment_ptr:
5557     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5558       // This only makes sense to call in a kernel, so just lower to null.
5559       B.buildConstant(MI.getOperand(0).getReg(), 0);
5560       MI.eraseFromParent();
5561       return true;
5562     }
5563 
5564     return legalizePreloadedArgIntrin(
5565       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5566   case Intrinsic::amdgcn_implicitarg_ptr:
5567     return legalizeImplicitArgPtr(MI, MRI, B);
5568   case Intrinsic::amdgcn_workitem_id_x:
5569     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5570                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5571   case Intrinsic::amdgcn_workitem_id_y:
5572     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5573                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5574   case Intrinsic::amdgcn_workitem_id_z:
5575     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5576                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5577   case Intrinsic::amdgcn_workgroup_id_x:
5578     return legalizePreloadedArgIntrin(MI, MRI, B,
5579                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5580   case Intrinsic::amdgcn_workgroup_id_y:
5581     return legalizePreloadedArgIntrin(MI, MRI, B,
5582                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5583   case Intrinsic::amdgcn_workgroup_id_z:
5584     return legalizePreloadedArgIntrin(MI, MRI, B,
5585                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5586   case Intrinsic::amdgcn_dispatch_ptr:
5587     return legalizePreloadedArgIntrin(MI, MRI, B,
5588                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5589   case Intrinsic::amdgcn_queue_ptr:
5590     return legalizePreloadedArgIntrin(MI, MRI, B,
5591                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5592   case Intrinsic::amdgcn_implicit_buffer_ptr:
5593     return legalizePreloadedArgIntrin(
5594       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5595   case Intrinsic::amdgcn_dispatch_id:
5596     return legalizePreloadedArgIntrin(MI, MRI, B,
5597                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5598   case Intrinsic::r600_read_ngroups_x:
5599     // TODO: Emit error for hsa
5600     return legalizeKernargMemParameter(MI, B,
5601                                        SI::KernelInputOffsets::NGROUPS_X);
5602   case Intrinsic::r600_read_ngroups_y:
5603     return legalizeKernargMemParameter(MI, B,
5604                                        SI::KernelInputOffsets::NGROUPS_Y);
5605   case Intrinsic::r600_read_ngroups_z:
5606     return legalizeKernargMemParameter(MI, B,
5607                                        SI::KernelInputOffsets::NGROUPS_Z);
5608   case Intrinsic::r600_read_local_size_x:
5609     // TODO: Could insert G_ASSERT_ZEXT from s16
5610     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5611   case Intrinsic::r600_read_local_size_y:
5612     // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5617   case Intrinsic::r600_read_global_size_x:
5618     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5619   case Intrinsic::r600_read_global_size_y:
5620     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5621   case Intrinsic::r600_read_global_size_z:
5622     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5623   case Intrinsic::amdgcn_fdiv_fast:
5624     return legalizeFDIVFastIntrin(MI, MRI, B);
5625   case Intrinsic::amdgcn_is_shared:
5626     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5627   case Intrinsic::amdgcn_is_private:
5628     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5629   case Intrinsic::amdgcn_wavefrontsize: {
5630     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5631     MI.eraseFromParent();
5632     return true;
5633   }
5634   case Intrinsic::amdgcn_s_buffer_load:
5635     return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
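  // All buffer atomic RMW variants (including fmin/fmax) share one
  // legalization path; legalizeBufferAtomic dispatches on IntrID. The fadd
  // variants are handled separately below.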
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
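  // Buffer FP atomic adds additionally check the subtarget: using the
  // returned (pre-op) value requires the gfx90a instructions, so otherwise
  // emit a diagnostic and fold the result to undef.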
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
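  // Image (MIMG) intrinsics are legalized generically from the dimension
  // intrinsic table; any other intrinsic needs no legalization here.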
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}