1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of vector elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}
55 
// Round the scalar size up to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
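// Predicate matching types whose total size in bits is a multiple of 32.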
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
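// Predicate matching vectors of 16-bit elements with more than two elements.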
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
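// Mutation increasing the element count by one, e.g. <3 x s16> -> <4 x s16>.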
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx,
100                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
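// Mutation that breaks a vector wider than 64 bits into pieces of at most
// 64 bits each, keeping the original element type.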
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::make_pair(
112         TypeIdx,
113         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114   };
115 }
116 
// Increase the number of vector elements until the total size is a multiple
// of 32 bits.
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120   return [=](const LegalityQuery &Query) {
121     const LLT Ty = Query.Types[TypeIdx];
122 
123     const LLT EltTy = Ty.getElementType();
124     const int Size = Ty.getSizeInBits();
125     const int EltSize = EltTy.getSizeInBits();
126     const int NextMul32 = (Size + 31) / 32;
127 
128     assert(EltSize < 32);
129 
130     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132   };
133 }
134 
135 static LLT getBitcastRegisterType(const LLT Ty) {
136   const unsigned Size = Ty.getSizeInBits();
137 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
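// Mutation bitcasting the type at TypeIdx to the equivalent register type
// (see getBitcastRegisterType).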
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
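// Mutation bitcasting a type whose size is a multiple of 32 bits to a scalar
// or vector with 32-bit elements.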
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
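// True if Size is a multiple of 32 bits no larger than the maximum register
// size (1024 bits).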
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
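// True for element types that fit directly in registers: 16-bit elements or
// multiples of 32 bits.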
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
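// True for vector types whose elements map onto register classes: 32, 64,
// 128 or 256-bit elements, or an even number of 16-bit elements.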
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
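// Predicate matching vectors whose element type is either s16 or at least
// 32 bits wide.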
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
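// Whether a load or store of this register type, memory size, alignment and
// address space can be selected directly, without splitting or widening.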
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
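// A load/store is reported legal if the result is a register type, the size
// is handled for the address space, and the bitcast workaround is not needed.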
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB})
534       .legalFor({S32, S16, V2S16})
535       .clampMaxNumElementsStrict(0, S16, 2)
536       .scalarize(0)
537       .minScalar(0, S16)
538       .widenScalarToNextMultipleOf(0, 32)
539       .maxScalar(0, S32);
540 
541     getActionDefinitionsBuilder(G_MUL)
542       .legalFor({S32, S16, V2S16})
543       .clampMaxNumElementsStrict(0, S16, 2)
544       .scalarize(0)
545       .minScalar(0, S16)
546       .widenScalarToNextMultipleOf(0, 32)
547       .custom();
548     assert(ST.hasMad64_32());
549 
550     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551       .legalFor({S32, S16, V2S16}) // Clamp modifier
552       .minScalarOrElt(0, S16)
553       .clampMaxNumElementsStrict(0, S16, 2)
554       .scalarize(0)
555       .widenScalarToNextPow2(0, 32)
556       .lower();
557   } else if (ST.has16BitInsts()) {
558     getActionDefinitionsBuilder({G_ADD, G_SUB})
559       .legalFor({S32, S16})
560       .minScalar(0, S16)
561       .widenScalarToNextMultipleOf(0, 32)
562       .maxScalar(0, S32)
563       .scalarize(0);
564 
565     getActionDefinitionsBuilder(G_MUL)
566       .legalFor({S32, S16})
567       .scalarize(0)
568       .minScalar(0, S16)
569       .widenScalarToNextMultipleOf(0, 32)
570       .custom();
571     assert(ST.hasMad64_32());
572 
573     // Technically the saturating operations require clamp bit support, but this
574     // was introduced at the same time as 16-bit operations.
575     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576       .legalFor({S32, S16}) // Clamp modifier
577       .minScalar(0, S16)
578       .scalarize(0)
579       .widenScalarToNextPow2(0, 16)
580       .lower();
581 
582     // We're just lowering this, but it helps get a better result to try to
583     // coerce to the desired type first.
584     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585       .minScalar(0, S16)
586       .scalarize(0)
587       .lower();
588   } else {
589     getActionDefinitionsBuilder({G_ADD, G_SUB})
590       .legalFor({S32})
591       .widenScalarToNextMultipleOf(0, 32)
592       .clampScalar(0, S32, S32)
593       .scalarize(0);
594 
595     auto &Mul = getActionDefinitionsBuilder(G_MUL)
596       .legalFor({S32})
597       .scalarize(0)
598       .minScalar(0, S32)
599       .widenScalarToNextMultipleOf(0, 32);
600 
601     if (ST.hasMad64_32())
602       Mul.custom();
603     else
604       Mul.maxScalar(0, S32);
605 
606     if (ST.hasIntClamp()) {
607       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608         .legalFor({S32}) // Clamp modifier.
609         .scalarize(0)
610         .minScalarOrElt(0, S32)
611         .lower();
612     } else {
613       // Clamp bit support was added in VI, along with 16-bit operations.
614       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615         .minScalar(0, S32)
616         .scalarize(0)
617         .lower();
618     }
619 
620     // FIXME: DAG expansion gets better results. The widening uses the smaller
621     // range values and goes for the min/max lowering directly.
622     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623       .minScalar(0, S32)
624       .scalarize(0)
625       .lower();
626   }
627 
628   getActionDefinitionsBuilder(
629       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
630       .customFor({S32, S64})
631       .clampScalar(0, S32, S64)
632       .widenScalarToNextPow2(0, 32)
633       .scalarize(0);
634 
635   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
636                    .legalFor({S32})
637                    .maxScalar(0, S32);
638 
639   if (ST.hasVOP3PInsts()) {
640     Mulh
641       .clampMaxNumElements(0, S8, 2)
642       .lowerFor({V2S8});
643   }
644 
645   Mulh
646     .scalarize(0)
647     .lower();
648 
649   // Report legal for any types we can handle anywhere. For the cases only legal
650   // on the SALU, RegBankSelect will be able to re-legalize.
651   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
652     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
653     .clampScalar(0, S32, S64)
654     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
655     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
656     .widenScalarToNextPow2(0)
657     .scalarize(0);
658 
659   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
660                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661     .legalFor({{S32, S1}, {S32, S32}})
662     .minScalar(0, S32)
663     .scalarize(0)
664     .lower();
665 
666   getActionDefinitionsBuilder(G_BITCAST)
667     // Don't worry about the size constraint.
668     .legalIf(all(isRegisterType(0), isRegisterType(1)))
669     .lower();
670 
671 
672   getActionDefinitionsBuilder(G_CONSTANT)
673     .legalFor({S1, S32, S64, S16, GlobalPtr,
674                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675     .legalIf(isPointer(0))
676     .clampScalar(0, S32, S64)
677     .widenScalarToNextPow2(0);
678 
679   getActionDefinitionsBuilder(G_FCONSTANT)
680     .legalFor({S32, S64, S16})
681     .clampScalar(0, S16, S64);
682 
683   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
684       .legalIf(isRegisterType(0))
685       // s1 and s16 are special cases because they have legal operations on
686       // them, but don't really occupy registers in the normal way.
687       .legalFor({S1, S16})
688       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689       .clampScalarOrElt(0, S32, MaxScalar)
690       .widenScalarToNextPow2(0, 32)
691       .clampMaxNumElements(0, S32, 16);
692 
693   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
694 
695   // If the amount is divergent, we have to do a wave reduction to get the
696   // maximum value, so this is expanded during RegBankSelect.
697   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
698     .legalFor({{PrivatePtr, S32}});
699 
700   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701     .customIf(typeIsNot(0, PrivatePtr));
702 
703   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
704 
705   auto &FPOpActions = getActionDefinitionsBuilder(
706     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
707     .legalFor({S32, S64});
708   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
709     .customFor({S32, S64});
710   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
711     .customFor({S32, S64});
712 
713   if (ST.has16BitInsts()) {
714     if (ST.hasVOP3PInsts())
715       FPOpActions.legalFor({S16, V2S16});
716     else
717       FPOpActions.legalFor({S16});
718 
719     TrigActions.customFor({S16});
720     FDIVActions.customFor({S16});
721   }
722 
723   auto &MinNumMaxNum = getActionDefinitionsBuilder({
724       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
725 
726   if (ST.hasVOP3PInsts()) {
727     MinNumMaxNum.customFor(FPTypesPK16)
728       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
729       .clampMaxNumElements(0, S16, 2)
730       .clampScalar(0, S16, S64)
731       .scalarize(0);
732   } else if (ST.has16BitInsts()) {
733     MinNumMaxNum.customFor(FPTypes16)
734       .clampScalar(0, S16, S64)
735       .scalarize(0);
736   } else {
737     MinNumMaxNum.customFor(FPTypesBase)
738       .clampScalar(0, S32, S64)
739       .scalarize(0);
740   }
741 
742   if (ST.hasVOP3PInsts())
743     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
744 
745   FPOpActions
746     .scalarize(0)
747     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
748 
749   TrigActions
750     .scalarize(0)
751     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
752 
753   FDIVActions
754     .scalarize(0)
755     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
756 
757   getActionDefinitionsBuilder({G_FNEG, G_FABS})
758     .legalFor(FPTypesPK16)
759     .clampMaxNumElementsStrict(0, S16, 2)
760     .scalarize(0)
761     .clampScalar(0, S16, S64);
762 
763   if (ST.has16BitInsts()) {
764     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
765       .legalFor({S32, S64, S16})
766       .scalarize(0)
767       .clampScalar(0, S16, S64);
768   } else {
769     getActionDefinitionsBuilder(G_FSQRT)
770       .legalFor({S32, S64})
771       .scalarize(0)
772       .clampScalar(0, S32, S64);
773 
774     if (ST.hasFractBug()) {
775       getActionDefinitionsBuilder(G_FFLOOR)
776         .customFor({S64})
777         .legalFor({S32, S64})
778         .scalarize(0)
779         .clampScalar(0, S32, S64);
780     } else {
781       getActionDefinitionsBuilder(G_FFLOOR)
782         .legalFor({S32, S64})
783         .scalarize(0)
784         .clampScalar(0, S32, S64);
785     }
786   }
787 
788   getActionDefinitionsBuilder(G_FPTRUNC)
789     .legalFor({{S32, S64}, {S16, S32}})
790     .scalarize(0)
791     .lower();
792 
793   getActionDefinitionsBuilder(G_FPEXT)
794     .legalFor({{S64, S32}, {S32, S16}})
795     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
796     .scalarize(0);
797 
798   getActionDefinitionsBuilder(G_FSUB)
799       // Use actual fsub instruction
800       .legalFor({S32})
801       // Must use fadd + fneg
802       .lowerFor({S64, S16, V2S16})
803       .scalarize(0)
804       .clampScalar(0, S32, S64);
805 
806   // Whether this is legal depends on the floating point mode for the function.
807   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
808   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
809     FMad.customFor({S32, S16});
810   else if (ST.hasMadMacF32Insts())
811     FMad.customFor({S32});
812   else if (ST.hasMadF16())
813     FMad.customFor({S16});
814   FMad.scalarize(0)
815       .lower();
816 
817   auto &FRem = getActionDefinitionsBuilder(G_FREM);
818   if (ST.has16BitInsts()) {
819     FRem.customFor({S16, S32, S64});
820   } else {
821     FRem.minScalar(0, S32)
822         .customFor({S32, S64});
823   }
824   FRem.scalarize(0);
825 
826   // TODO: Do we need to clamp maximum bitwidth?
827   getActionDefinitionsBuilder(G_TRUNC)
828     .legalIf(isScalar(0))
829     .legalFor({{V2S16, V2S32}})
830     .clampMaxNumElements(0, S16, 2)
831     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
832     // situations (like an invalid implicit use), we don't want to infinite loop
833     // in the legalizer.
834     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
835     .alwaysLegal();
836 
837   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
838     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
839                {S32, S1}, {S64, S1}, {S16, S1}})
840     .scalarize(0)
841     .clampScalar(0, S32, S64)
842     .widenScalarToNextPow2(1, 32);
843 
844   // TODO: Split s1->s64 during regbankselect for VALU.
845   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
846                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
847                     .lowerIf(typeIs(1, S1))
848                     .customFor({{S32, S64}, {S64, S64}});
849   if (ST.has16BitInsts())
850     IToFP.legalFor({{S16, S16}});
851   IToFP.clampScalar(1, S32, S64)
852        .minScalar(0, S32)
853        .scalarize(0)
854        .widenScalarToNextPow2(1);
855 
856   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
857     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
858     .customFor({{S64, S32}, {S64, S64}})
859     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
860   if (ST.has16BitInsts())
861     FPToI.legalFor({{S16, S16}});
862   else
863     FPToI.minScalar(1, S32);
864 
865   FPToI.minScalar(0, S32)
866        .widenScalarToNextPow2(0, 32)
867        .scalarize(0)
868        .lower();
869 
870   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
871       .customFor({S16, S32})
872       .scalarize(0)
873       .lower();
874 
875   // Lower roundeven into G_FRINT
876   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
877     .scalarize(0)
878     .lower();
879 
880   if (ST.has16BitInsts()) {
881     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
882       .legalFor({S16, S32, S64})
883       .clampScalar(0, S16, S64)
884       .scalarize(0);
885   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
886     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
887       .legalFor({S32, S64})
888       .clampScalar(0, S32, S64)
889       .scalarize(0);
890   } else {
891     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
892       .legalFor({S32})
893       .customFor({S64})
894       .clampScalar(0, S32, S64)
895       .scalarize(0);
896   }
897 
898   getActionDefinitionsBuilder(G_PTR_ADD)
899     .legalIf(all(isPointer(0), sameSize(0, 1)))
900     .scalarize(0)
901     .scalarSameSizeAs(1, 0);
902 
903   getActionDefinitionsBuilder(G_PTRMASK)
904     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
905     .scalarSameSizeAs(1, 0)
906     .scalarize(0);
907 
908   auto &CmpBuilder =
909     getActionDefinitionsBuilder(G_ICMP)
910     // The compare output type differs based on the register bank of the output,
911     // so make both s1 and s32 legal.
912     //
913     // Scalar compares producing output in scc will be promoted to s32, as that
914     // is the allocatable register type that will be needed for the copy from
915     // scc. This will be promoted during RegBankSelect, and we assume something
916     // before that won't try to use s32 result types.
917     //
918     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
919     // bank.
920     .legalForCartesianProduct(
921       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
922     .legalForCartesianProduct(
923       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
924   if (ST.has16BitInsts()) {
925     CmpBuilder.legalFor({{S1, S16}});
926   }
927 
928   CmpBuilder
929     .widenScalarToNextPow2(1)
930     .clampScalar(1, S32, S64)
931     .scalarize(0)
932     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
933 
934   getActionDefinitionsBuilder(G_FCMP)
935     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
936     .widenScalarToNextPow2(1)
937     .clampScalar(1, S32, S64)
938     .scalarize(0);
939 
940   // FIXME: fpow has a selection pattern that should move to custom lowering.
941   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
942   if (ST.has16BitInsts())
943     Exp2Ops.legalFor({S32, S16});
944   else
945     Exp2Ops.legalFor({S32});
946   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
947   Exp2Ops.scalarize(0);
948 
949   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
950   if (ST.has16BitInsts())
951     ExpOps.customFor({{S32}, {S16}});
952   else
953     ExpOps.customFor({S32});
954   ExpOps.clampScalar(0, MinScalarFPTy, S32)
955         .scalarize(0);
956 
957   getActionDefinitionsBuilder(G_FPOWI)
958     .clampScalar(0, MinScalarFPTy, S32)
959     .lower();
960 
961   // The 64-bit versions produce 32-bit results, but only on the SALU.
962   getActionDefinitionsBuilder(G_CTPOP)
963     .legalFor({{S32, S32}, {S32, S64}})
964     .clampScalar(0, S32, S32)
965     .widenScalarToNextPow2(1, 32)
966     .clampScalar(1, S32, S64)
967     .scalarize(0)
968     .widenScalarToNextPow2(0, 32);
969 
970 
971   // The hardware instructions return a different result on 0 than the generic
972   // instructions expect. The hardware produces -1, but these produce the
973   // bitwidth.
974   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
975     .scalarize(0)
976     .clampScalar(0, S32, S32)
977     .clampScalar(1, S32, S64)
978     .widenScalarToNextPow2(0, 32)
979     .widenScalarToNextPow2(1, 32)
980     .custom();
981 
982   // The 64-bit versions produce 32-bit results, but only on the SALU.
983   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
984     .legalFor({{S32, S32}, {S32, S64}})
985     .clampScalar(0, S32, S32)
986     .clampScalar(1, S32, S64)
987     .scalarize(0)
988     .widenScalarToNextPow2(0, 32)
989     .widenScalarToNextPow2(1, 32);
990 
991   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
992   // RegBankSelect.
993   getActionDefinitionsBuilder(G_BITREVERSE)
994     .legalFor({S32, S64})
995     .clampScalar(0, S32, S64)
996     .scalarize(0)
997     .widenScalarToNextPow2(0);
998 
999   if (ST.has16BitInsts()) {
1000     getActionDefinitionsBuilder(G_BSWAP)
1001       .legalFor({S16, S32, V2S16})
1002       .clampMaxNumElementsStrict(0, S16, 2)
1003       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1004       // narrowScalar limitation.
1005       .widenScalarToNextPow2(0)
1006       .clampScalar(0, S16, S32)
1007       .scalarize(0);
1008 
1009     if (ST.hasVOP3PInsts()) {
1010       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1011         .legalFor({S32, S16, V2S16})
1012         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1013         .clampMaxNumElements(0, S16, 2)
1014         .minScalar(0, S16)
1015         .widenScalarToNextPow2(0)
1016         .scalarize(0)
1017         .lower();
1018     } else {
1019       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1020         .legalFor({S32, S16})
1021         .widenScalarToNextPow2(0)
1022         .minScalar(0, S16)
1023         .scalarize(0)
1024         .lower();
1025     }
1026   } else {
1027     // TODO: Should have same legality without v_perm_b32
1028     getActionDefinitionsBuilder(G_BSWAP)
1029       .legalFor({S32})
1030       .lowerIf(scalarNarrowerThan(0, 32))
1031       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1032       // narrowScalar limitation.
1033       .widenScalarToNextPow2(0)
1034       .maxScalar(0, S32)
1035       .scalarize(0)
1036       .lower();
1037 
1038     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1039       .legalFor({S32})
1040       .minScalar(0, S32)
1041       .widenScalarToNextPow2(0)
1042       .scalarize(0)
1043       .lower();
1044   }
1045 
1046   getActionDefinitionsBuilder(G_INTTOPTR)
1047     // List the common cases
1048     .legalForCartesianProduct(AddrSpaces64, {S64})
1049     .legalForCartesianProduct(AddrSpaces32, {S32})
1050     .scalarize(0)
1051     // Accept any address space as long as the size matches
1052     .legalIf(sameSize(0, 1))
1053     .widenScalarIf(smallerThan(1, 0),
1054       [](const LegalityQuery &Query) {
1055         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1056       })
1057     .narrowScalarIf(largerThan(1, 0),
1058       [](const LegalityQuery &Query) {
1059         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1060       });
1061 
1062   getActionDefinitionsBuilder(G_PTRTOINT)
1063     // List the common cases
1064     .legalForCartesianProduct(AddrSpaces64, {S64})
1065     .legalForCartesianProduct(AddrSpaces32, {S32})
1066     .scalarize(0)
1067     // Accept any address space as long as the size matches
1068     .legalIf(sameSize(0, 1))
1069     .widenScalarIf(smallerThan(0, 1),
1070       [](const LegalityQuery &Query) {
1071         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1072       })
1073     .narrowScalarIf(
1074       largerThan(0, 1),
1075       [](const LegalityQuery &Query) {
1076         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1077       });
1078 
1079   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1080     .scalarize(0)
1081     .custom();
1082 
1083   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1084                                     bool IsLoad) -> bool {
1085     const LLT DstTy = Query.Types[0];
1086 
1087     // Split vector extloads.
1088     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1089 
1090     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1091       return true;
1092 
1093     const LLT PtrTy = Query.Types[1];
1094     unsigned AS = PtrTy.getAddressSpace();
1095     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1096       return true;
1097 
1098     // Catch weird sized loads that don't evenly divide into the access sizes
1099     // TODO: May be able to widen depending on alignment etc.
1100     unsigned NumRegs = (MemSize + 31) / 32;
1101     if (NumRegs == 3) {
1102       if (!ST.hasDwordx3LoadStores())
1103         return true;
1104     } else {
1105       // If the alignment allows, these should have been widened.
1106       if (!isPowerOf2_32(NumRegs))
1107         return true;
1108     }
1109 
1110     return false;
1111   };
1112 
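  // Minimum alignment (in bits) for the explicitly listed global/constant
  // load/store cases below; 0 when the subtarget supports unaligned buffer
  // accesses.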
1113   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1114   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1115   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1116 
1117   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1118   // LDS
1119   // TODO: Unsupported flat for SI.
1120 
1121   for (unsigned Op : {G_LOAD, G_STORE}) {
1122     const bool IsStore = Op == G_STORE;
1123 
1124     auto &Actions = getActionDefinitionsBuilder(Op);
1125     // Explicitly list some common cases.
1126     // TODO: Does this help compile time at all?
1127     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1128                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1129                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1130                                       {S64, GlobalPtr, S64, GlobalAlign32},
1131                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1132                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1133                                       {S32, GlobalPtr, S8, GlobalAlign8},
1134                                       {S32, GlobalPtr, S16, GlobalAlign16},
1135 
1136                                       {S32, LocalPtr, S32, 32},
1137                                       {S64, LocalPtr, S64, 32},
1138                                       {V2S32, LocalPtr, V2S32, 32},
1139                                       {S32, LocalPtr, S8, 8},
1140                                       {S32, LocalPtr, S16, 16},
1141                                       {V2S16, LocalPtr, S32, 32},
1142 
1143                                       {S32, PrivatePtr, S32, 32},
1144                                       {S32, PrivatePtr, S8, 8},
1145                                       {S32, PrivatePtr, S16, 16},
1146                                       {V2S16, PrivatePtr, S32, 32},
1147 
1148                                       {S32, ConstantPtr, S32, GlobalAlign32},
1149                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1150                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1151                                       {S64, ConstantPtr, S64, GlobalAlign32},
1152                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1153     Actions.legalIf(
1154       [=](const LegalityQuery &Query) -> bool {
1155         return isLoadStoreLegal(ST, Query);
1156       });
1157 
1158     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1159     // 64-bits.
1160     //
1161     // TODO: Should generalize bitcast action into coerce, which will also cover
1162     // inserting addrspacecasts.
1163     Actions.customIf(typeIs(1, Constant32Ptr));
1164 
1165     // Turn any illegal element vectors into something easier to deal
1166     // with. These will ultimately produce 32-bit scalar shifts to extract the
1167     // parts anyway.
1168     //
1169     // For odd 16-bit element vectors, prefer to split those into pieces with
1170     // 16-bit vector parts.
1171     Actions.bitcastIf(
1172       [=](const LegalityQuery &Query) -> bool {
1173         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1174                                           Query.MMODescrs[0].MemoryTy);
1175       }, bitcastToRegisterType(0));
1176 
1177     if (!IsStore) {
1178       // Widen suitably aligned loads by loading extra bytes. The standard
1179       // legalization actions can't properly express widening memory operands.
1180       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1181         return shouldWidenLoad(ST, Query, G_LOAD);
1182       });
1183     }
1184 
1185     // FIXME: load/store narrowing should be moved to lower action
1186     Actions
1187         .narrowScalarIf(
1188             [=](const LegalityQuery &Query) -> bool {
1189               return !Query.Types[0].isVector() &&
1190                      needToSplitMemOp(Query, Op == G_LOAD);
1191             },
1192             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1193               const LLT DstTy = Query.Types[0];
1194               const LLT PtrTy = Query.Types[1];
1195 
1196               const unsigned DstSize = DstTy.getSizeInBits();
1197               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198 
1199               // Split extloads.
1200               if (DstSize > MemSize)
1201                 return std::make_pair(0, LLT::scalar(MemSize));
1202 
1203               unsigned MaxSize = maxSizeForAddrSpace(ST,
1204                                                      PtrTy.getAddressSpace(),
1205                                                      Op == G_LOAD);
1206               if (MemSize > MaxSize)
1207                 return std::make_pair(0, LLT::scalar(MaxSize));
1208 
1209               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1210               return std::make_pair(0, LLT::scalar(Align));
1211             })
1212         .fewerElementsIf(
1213             [=](const LegalityQuery &Query) -> bool {
1214               return Query.Types[0].isVector() &&
1215                      needToSplitMemOp(Query, Op == G_LOAD);
1216             },
1217             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1218               const LLT DstTy = Query.Types[0];
1219               const LLT PtrTy = Query.Types[1];
1220 
1221               LLT EltTy = DstTy.getElementType();
1222               unsigned MaxSize = maxSizeForAddrSpace(ST,
1223                                                      PtrTy.getAddressSpace(),
1224                                                      Op == G_LOAD);
1225 
1226               // FIXME: Handle widened to power of 2 results better. This ends
1227               // up scalarizing.
1228               // FIXME: 3 element stores scalarized on SI
1229 
1230               // Split if it's too large for the address space.
1231               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1232               if (MemSize > MaxSize) {
1233                 unsigned NumElts = DstTy.getNumElements();
1234                 unsigned EltSize = EltTy.getSizeInBits();
1235 
1236                 if (MaxSize % EltSize == 0) {
1237                   return std::make_pair(
1238                       0, LLT::scalarOrVector(
1239                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1240                 }
1241 
1242                 unsigned NumPieces = MemSize / MaxSize;
1243 
1244                 // FIXME: Refine when odd breakdowns handled
1245                 // The scalars will need to be re-legalized.
1246                 if (NumPieces == 1 || NumPieces >= NumElts ||
1247                     NumElts % NumPieces != 0)
1248                   return std::make_pair(0, EltTy);
1249 
1250                 return std::make_pair(
1251                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1252               }
1253 
1254               // FIXME: We could probably handle weird extending loads better.
1255               if (DstTy.getSizeInBits() > MemSize)
1256                 return std::make_pair(0, EltTy);
1257 
1258               unsigned EltSize = EltTy.getSizeInBits();
1259               unsigned DstSize = DstTy.getSizeInBits();
1260               if (!isPowerOf2_32(DstSize)) {
1261                 // We're probably decomposing an odd sized store. Try to split
1262                 // to the widest type. TODO: Account for alignment. As-is it
1263                 // should be OK, since the new parts will be further legalized.
1264                 unsigned FloorSize = PowerOf2Floor(DstSize);
1265                 return std::make_pair(
1266                     0, LLT::scalarOrVector(
1267                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1268               }
1269 
1270               // May need relegalization for the scalars.
1271               return std::make_pair(0, EltTy);
1272             })
1273     .minScalar(0, S32)
1274     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1275     .widenScalarToNextPow2(0)
1276     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1277     .lower();
1278   }
1279 
1280   // FIXME: Unaligned accesses not lowered.
1281   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1282                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1283                                                   {S32, GlobalPtr, S16, 2 * 8},
1284                                                   {S32, LocalPtr, S8, 8},
1285                                                   {S32, LocalPtr, S16, 16},
1286                                                   {S32, PrivatePtr, S8, 8},
1287                                                   {S32, PrivatePtr, S16, 16},
1288                                                   {S32, ConstantPtr, S8, 8},
1289                                                   {S32, ConstantPtr, S16, 2 * 8}})
1290                        .legalIf(
1291                          [=](const LegalityQuery &Query) -> bool {
1292                            return isLoadStoreLegal(ST, Query);
1293                          });
1294 
1295   if (ST.hasFlatAddressSpace()) {
1296     ExtLoads.legalForTypesWithMemDesc(
1297         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1298   }
1299 
1300   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1301   // 64-bits.
1302   //
1303   // TODO: Should generalize bitcast action into coerce, which will also cover
1304   // inserting addrspacecasts.
1305   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1306 
1307   ExtLoads.clampScalar(0, S32, S32)
1308           .widenScalarToNextPow2(0)
1309           .lower();
1310 
1311   auto &Atomics = getActionDefinitionsBuilder(
1312     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1313      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1314      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1315      G_ATOMICRMW_UMIN})
1316     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1317                {S64, GlobalPtr}, {S64, LocalPtr},
1318                {S32, RegionPtr}, {S64, RegionPtr}});
1319   if (ST.hasFlatAddressSpace()) {
1320     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1321   }
1322 
1323   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1324   if (ST.hasLDSFPAtomicAdd()) {
1325     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1326     if (ST.hasGFX90AInsts())
1327       Atomic.legalFor({{S64, LocalPtr}});
1328     if (ST.hasGFX940Insts())
1329       Atomic.legalFor({{V2S16, LocalPtr}});
1330   }
1331   if (ST.hasAtomicFaddInsts())
1332     Atomic.legalFor({{S32, GlobalPtr}});
1333 
1334   if (ST.hasGFX90AInsts()) {
1335     // These are legal with some caveats, and should have undergone expansion in
1336     // the IR in most situations
1337     // TODO: Move atomic expansion into legalizer
1338     // TODO: Also supports <2 x f16>
1339     Atomic.legalFor({
1340         {S32, GlobalPtr},
1341         {S64, GlobalPtr},
1342         {S64, FlatPtr}
1343       });
1344   }
1345 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1348   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1349     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1350                 {S32, FlatPtr}, {S64, FlatPtr}})
1351     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1352                {S32, RegionPtr}, {S64, RegionPtr}});
1353   // TODO: Pointer types, any 32-bit or 64-bit vector
1354 
1355   // Condition should be s32 for scalar, s1 for vector.
1356   getActionDefinitionsBuilder(G_SELECT)
1357       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1358                                  LocalPtr, FlatPtr, PrivatePtr,
1359                                  LLT::fixed_vector(2, LocalPtr),
1360                                  LLT::fixed_vector(2, PrivatePtr)},
1361                                 {S1, S32})
1362       .clampScalar(0, S16, S64)
1363       .scalarize(1)
1364       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1365       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1366       .clampMaxNumElements(0, S32, 2)
1367       .clampMaxNumElements(0, LocalPtr, 2)
1368       .clampMaxNumElements(0, PrivatePtr, 2)
1369       .scalarize(0)
1370       .widenScalarToNextPow2(0)
1371       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1372 
1373   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1374   // be more flexible with the shift amount type.
1375   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1376     .legalFor({{S32, S32}, {S64, S32}});
1377   if (ST.has16BitInsts()) {
1378     if (ST.hasVOP3PInsts()) {
1379       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1380             .clampMaxNumElements(0, S16, 2);
1381     } else
1382       Shifts.legalFor({{S16, S16}});
1383 
1384     // TODO: Support 16-bit shift amounts for all types
1385     Shifts.widenScalarIf(
1386       [=](const LegalityQuery &Query) {
1387         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1388         // 32-bit amount.
1389         const LLT ValTy = Query.Types[0];
1390         const LLT AmountTy = Query.Types[1];
1391         return ValTy.getSizeInBits() <= 16 &&
1392                AmountTy.getSizeInBits() < 16;
1393       }, changeTo(1, S16));
1394     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1395     Shifts.clampScalar(1, S32, S32);
1396     Shifts.widenScalarToNextPow2(0, 16);
1397     Shifts.clampScalar(0, S16, S64);
1398 
1399     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1400       .minScalar(0, S16)
1401       .scalarize(0)
1402       .lower();
1403   } else {
1404     // Make sure we legalize the shift amount type first, as the general
1405     // expansion for the shifted type will produce much worse code if it hasn't
1406     // been truncated already.
1407     Shifts.clampScalar(1, S32, S32);
1408     Shifts.widenScalarToNextPow2(0, 32);
1409     Shifts.clampScalar(0, S32, S64);
1410 
1411     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1412       .minScalar(0, S32)
1413       .scalarize(0)
1414       .lower();
1415   }
1416   Shifts.scalarize(0);
1417 
1418   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1419     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1420     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1421     unsigned IdxTypeIdx = 2;
1422 
1423     getActionDefinitionsBuilder(Op)
1424       .customIf([=](const LegalityQuery &Query) {
1425           const LLT EltTy = Query.Types[EltTypeIdx];
1426           const LLT VecTy = Query.Types[VecTypeIdx];
1427           const LLT IdxTy = Query.Types[IdxTypeIdx];
1428           const unsigned EltSize = EltTy.getSizeInBits();
1429           return (EltSize == 32 || EltSize == 64) &&
1430                   VecTy.getSizeInBits() % 32 == 0 &&
1431                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1432                   IdxTy.getSizeInBits() == 32;
1433         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1436       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1437       .bitcastIf(
1438         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1439         [=](const LegalityQuery &Query) {
          // For element types wider than 64 bits, try to turn this into a
          // vector of 64-bit elements, since we may be able to do better
          // indexing on those. If the element size isn't a multiple of 64,
          // fall back to 32-bit elements.
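          // For example, a <2 x s128> source becomes <4 x s64>, while a
          // <4 x s96> source (element size not a multiple of 64) becomes
          // <12 x s32>.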
1443           const LLT EltTy = Query.Types[EltTypeIdx];
1444           const LLT VecTy = Query.Types[VecTypeIdx];
1445           const unsigned DstEltSize = EltTy.getSizeInBits();
1446           const unsigned VecSize = VecTy.getSizeInBits();
1447 
1448           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1449           return std::make_pair(
1450               VecTypeIdx,
1451               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1452         })
1453       .clampScalar(EltTypeIdx, S32, S64)
1454       .clampScalar(VecTypeIdx, S32, S64)
1455       .clampScalar(IdxTypeIdx, S32, S32)
1456       .clampMaxNumElements(VecTypeIdx, S32, 32)
1457       // TODO: Clamp elements for 64-bit vectors?
1458       // It should only be necessary with variable indexes.
1459       // As a last resort, lower to the stack
1460       .lower();
1461   }
1462 
1463   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1464     .unsupportedIf([=](const LegalityQuery &Query) {
1465         const LLT &EltTy = Query.Types[1].getElementType();
1466         return Query.Types[0] != EltTy;
1467       });
1468 
1469   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1470     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1471     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1472 
1473     // FIXME: Doesn't handle extract of illegal sizes.
1474     getActionDefinitionsBuilder(Op)
1475       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1476       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single-element) insert and extract.
1478           // TODO: verify immediate offset here since lower only works with
1479           // whole elements.
1480           const LLT BigTy = Query.Types[BigTyIdx];
1481           return BigTy.isVector();
1482         })
1483       // FIXME: Multiples of 16 should not be legal.
1484       .legalIf([=](const LegalityQuery &Query) {
1485           const LLT BigTy = Query.Types[BigTyIdx];
1486           const LLT LitTy = Query.Types[LitTyIdx];
1487           return (BigTy.getSizeInBits() % 32 == 0) &&
1488                  (LitTy.getSizeInBits() % 16 == 0);
1489         })
1490       .widenScalarIf(
1491         [=](const LegalityQuery &Query) {
1492           const LLT BigTy = Query.Types[BigTyIdx];
1493           return (BigTy.getScalarSizeInBits() < 16);
1494         },
1495         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1496       .widenScalarIf(
1497         [=](const LegalityQuery &Query) {
1498           const LLT LitTy = Query.Types[LitTyIdx];
1499           return (LitTy.getScalarSizeInBits() < 16);
1500         },
1501         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1502       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1503       .widenScalarToNextPow2(BigTyIdx, 32);
1504 
1505   }
1506 
1507   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1508     .legalForCartesianProduct(AllS32Vectors, {S32})
1509     .legalForCartesianProduct(AllS64Vectors, {S64})
1510     .clampNumElements(0, V16S32, V32S32)
1511     .clampNumElements(0, V2S64, V16S64)
1512     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1513 
1514   if (ST.hasScalarPackInsts()) {
1515     BuildVector
1516       // FIXME: Should probably widen s1 vectors straight to s32
1517       .minScalarOrElt(0, S16)
1518       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1519       .minScalar(1, S32);
1520 
1521     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1522       .legalFor({V2S16, S32})
1523       .lower();
1524     BuildVector.minScalarOrElt(0, S32);
1525   } else {
1526     BuildVector.customFor({V2S16, S16});
1527     BuildVector.minScalarOrElt(0, S32);
1528 
1529     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1530       .customFor({V2S16, S32})
1531       .lower();
1532   }
1533 
1534   BuildVector.legalIf(isRegisterType(0));
1535 
1536   // FIXME: Clamp maximum size
1537   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1538     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1539     .clampMaxNumElements(0, S32, 32)
1540     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1541     .clampMaxNumElements(0, S16, 64);
1542 
1543   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1544   // pre-legalize.
1545   if (ST.hasVOP3PInsts()) {
1546     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1547       .customFor({V2S16, V2S16})
1548       .lower();
1549   } else
1550     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1551 
1552   // Merge/Unmerge
1553   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1554     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1555     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1556 
1557     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1558       const LLT Ty = Query.Types[TypeIdx];
1559       if (Ty.isVector()) {
1560         const LLT &EltTy = Ty.getElementType();
1561         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1562           return true;
1563         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1564           return true;
1565       }
1566       return false;
1567     };
1568 
1569     auto &Builder = getActionDefinitionsBuilder(Op)
1570       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1571       .lowerFor({{S16, V2S16}})
1572       .lowerIf([=](const LegalityQuery &Query) {
1573           const LLT BigTy = Query.Types[BigTyIdx];
1574           return BigTy.getSizeInBits() == 32;
1575         })
1576       // Try to widen to s16 first for small types.
1577       // TODO: Only do this on targets with legal s16 shifts
1578       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1579       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1580       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1581       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1582                            elementTypeIs(1, S16)),
1583                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1585       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1586       // valid.
1587       .clampScalar(LitTyIdx, S32, S512)
1588       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1589       // Break up vectors with weird elements into scalars
1590       .fewerElementsIf(
1591         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1592         scalarize(0))
1593       .fewerElementsIf(
1594         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1595         scalarize(1))
1596       .clampScalar(BigTyIdx, S32, MaxScalar);
1597 
1598     if (Op == G_MERGE_VALUES) {
1599       Builder.widenScalarIf(
1600         // TODO: Use 16-bit shifts if legal for 8-bit values?
1601         [=](const LegalityQuery &Query) {
1602           const LLT Ty = Query.Types[LitTyIdx];
1603           return Ty.getSizeInBits() < 32;
1604         },
1605         changeTo(LitTyIdx, S32));
1606     }
1607 
1608     Builder.widenScalarIf(
1609       [=](const LegalityQuery &Query) {
1610         const LLT Ty = Query.Types[BigTyIdx];
1611         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1612           Ty.getSizeInBits() % 16 != 0;
1613       },
1614       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or (for sizes over 128) a multiple of 64,
        // whichever is smaller.
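        // For example, s70 widens to s128 (the next power of 2), while s300
        // widens to s320 (alignTo<64>(301)) rather than to s512.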
1617         const LLT &Ty = Query.Types[BigTyIdx];
1618         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1619         if (NewSizeInBits >= 256) {
1620           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1621           if (RoundedTo < NewSizeInBits)
1622             NewSizeInBits = RoundedTo;
1623         }
1624         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1625       })
1626       // Any vectors left are the wrong size. Scalarize them.
1627       .scalarize(0)
1628       .scalarize(1);
1629   }
1630 
1631   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1632   // RegBankSelect.
1633   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1634     .legalFor({{S32}, {S64}});
1635 
1636   if (ST.hasVOP3PInsts()) {
1637     SextInReg.lowerFor({{V2S16}})
1638       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1639       // get more vector shift opportunities, since we'll get those when
1640       // expanded.
1641       .clampMaxNumElementsStrict(0, S16, 2);
1642   } else if (ST.has16BitInsts()) {
1643     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1644   } else {
1645     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1647     SextInReg.lowerFor({{S32}, {S64}});
1648   }
1649 
1650   SextInReg
1651     .scalarize(0)
1652     .clampScalar(0, S32, S64)
1653     .lower();
1654 
1655   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1656     .scalarize(0)
1657     .lower();
1658 
  // TODO: Only try to form v2s16 with legal packed instructions.
1660   getActionDefinitionsBuilder(G_FSHR)
1661     .legalFor({{S32, S32}})
1662     .lowerFor({{V2S16, V2S16}})
1663     .clampMaxNumElementsStrict(0, S16, 2)
1664     .scalarize(0)
1665     .lower();
1666 
1667   if (ST.hasVOP3PInsts()) {
1668     getActionDefinitionsBuilder(G_FSHL)
1669       .lowerFor({{V2S16, V2S16}})
1670       .clampMaxNumElementsStrict(0, S16, 2)
1671       .scalarize(0)
1672       .lower();
1673   } else {
1674     getActionDefinitionsBuilder(G_FSHL)
1675       .scalarize(0)
1676       .lower();
1677   }
1678 
1679   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1680     .legalFor({S64});
1681 
1682   getActionDefinitionsBuilder(G_FENCE)
1683     .alwaysLegal();
1684 
1685   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1686       .scalarize(0)
1687       .minScalar(0, S32)
1688       .lower();
1689 
1690   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1691       .legalFor({{S32, S32}, {S64, S32}})
1692       .clampScalar(1, S32, S32)
1693       .clampScalar(0, S32, S64)
1694       .widenScalarToNextPow2(0)
1695       .scalarize(0);
1696 
1697   getActionDefinitionsBuilder({
1698       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1699       G_FCOPYSIGN,
1700 
1701       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1702       G_ATOMICRMW_NAND,
1703       G_ATOMICRMW_FSUB,
1704       G_READ_REGISTER,
1705       G_WRITE_REGISTER,
1706 
1707       G_SADDO, G_SSUBO,
1708 
      // TODO: Implement
1710       G_FMINIMUM, G_FMAXIMUM}).lower();
1711 
1712   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1713       .lower();
1714 
1715   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1716         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1717         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1718     .unsupported();
1719 
1720   getLegacyLegalizerInfo().computeTables();
1721   verify(*ST.getInstrInfo());
1722 }
1723 
1724 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1725                                          MachineInstr &MI) const {
1726   MachineIRBuilder &B = Helper.MIRBuilder;
1727   MachineRegisterInfo &MRI = *B.getMRI();
1728 
1729   switch (MI.getOpcode()) {
1730   case TargetOpcode::G_ADDRSPACE_CAST:
1731     return legalizeAddrSpaceCast(MI, MRI, B);
1732   case TargetOpcode::G_FRINT:
1733     return legalizeFrint(MI, MRI, B);
1734   case TargetOpcode::G_FCEIL:
1735     return legalizeFceil(MI, MRI, B);
1736   case TargetOpcode::G_FREM:
1737     return legalizeFrem(MI, MRI, B);
1738   case TargetOpcode::G_INTRINSIC_TRUNC:
1739     return legalizeIntrinsicTrunc(MI, MRI, B);
1740   case TargetOpcode::G_SITOFP:
1741     return legalizeITOFP(MI, MRI, B, true);
1742   case TargetOpcode::G_UITOFP:
1743     return legalizeITOFP(MI, MRI, B, false);
1744   case TargetOpcode::G_FPTOSI:
1745     return legalizeFPTOI(MI, MRI, B, true);
1746   case TargetOpcode::G_FPTOUI:
1747     return legalizeFPTOI(MI, MRI, B, false);
1748   case TargetOpcode::G_FMINNUM:
1749   case TargetOpcode::G_FMAXNUM:
1750   case TargetOpcode::G_FMINNUM_IEEE:
1751   case TargetOpcode::G_FMAXNUM_IEEE:
1752     return legalizeMinNumMaxNum(Helper, MI);
1753   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1754     return legalizeExtractVectorElt(MI, MRI, B);
1755   case TargetOpcode::G_INSERT_VECTOR_ELT:
1756     return legalizeInsertVectorElt(MI, MRI, B);
1757   case TargetOpcode::G_SHUFFLE_VECTOR:
1758     return legalizeShuffleVector(MI, MRI, B);
1759   case TargetOpcode::G_FSIN:
1760   case TargetOpcode::G_FCOS:
1761     return legalizeSinCos(MI, MRI, B);
1762   case TargetOpcode::G_GLOBAL_VALUE:
1763     return legalizeGlobalValue(MI, MRI, B);
1764   case TargetOpcode::G_LOAD:
1765   case TargetOpcode::G_SEXTLOAD:
1766   case TargetOpcode::G_ZEXTLOAD:
1767     return legalizeLoad(Helper, MI);
1768   case TargetOpcode::G_FMAD:
1769     return legalizeFMad(MI, MRI, B);
1770   case TargetOpcode::G_FDIV:
1771     return legalizeFDIV(MI, MRI, B);
1772   case TargetOpcode::G_UDIV:
1773   case TargetOpcode::G_UREM:
1774   case TargetOpcode::G_UDIVREM:
1775     return legalizeUnsignedDIV_REM(MI, MRI, B);
1776   case TargetOpcode::G_SDIV:
1777   case TargetOpcode::G_SREM:
1778   case TargetOpcode::G_SDIVREM:
1779     return legalizeSignedDIV_REM(MI, MRI, B);
1780   case TargetOpcode::G_ATOMIC_CMPXCHG:
1781     return legalizeAtomicCmpXChg(MI, MRI, B);
1782   case TargetOpcode::G_FLOG:
1783     return legalizeFlog(MI, B, numbers::ln2f);
1784   case TargetOpcode::G_FLOG10:
1785     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1786   case TargetOpcode::G_FEXP:
1787     return legalizeFExp(MI, B);
1788   case TargetOpcode::G_FPOW:
1789     return legalizeFPow(MI, B);
1790   case TargetOpcode::G_FFLOOR:
1791     return legalizeFFloor(MI, MRI, B);
1792   case TargetOpcode::G_BUILD_VECTOR:
1793     return legalizeBuildVector(MI, MRI, B);
1794   case TargetOpcode::G_MUL:
1795     return legalizeMul(Helper, MI);
1796   case TargetOpcode::G_CTLZ:
1797   case TargetOpcode::G_CTTZ:
1798     return legalizeCTLZ_CTTZ(MI, MRI, B);
1799   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1800     return legalizeFPTruncRound(MI, B);
1801   default:
1802     return false;
1803   }
1804 
1805   llvm_unreachable("expected switch to return");
1806 }
1807 
1808 Register AMDGPULegalizerInfo::getSegmentAperture(
1809   unsigned AS,
1810   MachineRegisterInfo &MRI,
1811   MachineIRBuilder &B) const {
1812   MachineFunction &MF = B.getMF();
1813   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1814   const LLT S32 = LLT::scalar(32);
1815 
1816   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1817 
1818   if (ST.hasApertureRegs()) {
1819     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1820     // getreg.
1821     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1822         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1823         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1824     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1825         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1826         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1827     unsigned Encoding =
1828         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1829         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1830         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1831 
1832     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1833 
1834     B.buildInstr(AMDGPU::S_GETREG_B32)
1835       .addDef(GetReg)
1836       .addImm(Encoding);
1837     MRI.setType(GetReg, S32);
1838 
1839     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1840     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1841   }
1842 
1843   // TODO: can we be smarter about machine pointer info?
1844   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1845   Register LoadAddr = MRI.createGenericVirtualRegister(
1846     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1847   // For code object version 5, private_base and shared_base are passed through
1848   // implicit kernargs.
1849   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1850     AMDGPUTargetLowering::ImplicitParameter Param =
1851         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1852                                       : AMDGPUTargetLowering::PRIVATE_BASE;
1853     uint64_t Offset =
1854         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1855 
1856     Register KernargPtrReg = MRI.createGenericVirtualRegister(
1857         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1858 
1859     if (!loadInputValue(KernargPtrReg, B,
1860                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1861       return Register();
1862 
1863     MachineMemOperand *MMO = MF.getMachineMemOperand(
1864         PtrInfo,
1865         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1866             MachineMemOperand::MOInvariant,
1867         LLT::scalar(32), commonAlignment(Align(64), Offset));
1868 
1869     // Pointer address
1870     B.buildPtrAdd(LoadAddr, KernargPtrReg,
1871                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1872     // Load address
1873     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1874   }
1875 
1876   Register QueuePtr = MRI.createGenericVirtualRegister(
1877     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1878 
1879   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1880     return Register();
1881 
1882   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1883   // private_segment_aperture_base_hi.
1884   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1885 
1886   MachineMemOperand *MMO = MF.getMachineMemOperand(
1887       PtrInfo,
1888       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1889           MachineMemOperand::MOInvariant,
1890       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1891 
1892   B.buildPtrAdd(LoadAddr, QueuePtr,
1893                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1894   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1895 }
1896 
1897 /// Return true if the value is a known valid address, such that a null check is
1898 /// not necessary.
1899 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1900                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1901   MachineInstr *Def = MRI.getVRegDef(Val);
1902   switch (Def->getOpcode()) {
1903   case AMDGPU::G_FRAME_INDEX:
1904   case AMDGPU::G_GLOBAL_VALUE:
1905   case AMDGPU::G_BLOCK_ADDR:
1906     return true;
1907   case AMDGPU::G_CONSTANT: {
1908     const ConstantInt *CI = Def->getOperand(1).getCImm();
1909     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1910   }
1911   default:
1912     return false;
1913   }
1914 
1915   return false;
1916 }
1917 
1918 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1919   MachineInstr &MI, MachineRegisterInfo &MRI,
1920   MachineIRBuilder &B) const {
1921   MachineFunction &MF = B.getMF();
1922 
1923   const LLT S32 = LLT::scalar(32);
1924   Register Dst = MI.getOperand(0).getReg();
1925   Register Src = MI.getOperand(1).getReg();
1926 
1927   LLT DstTy = MRI.getType(Dst);
1928   LLT SrcTy = MRI.getType(Src);
1929   unsigned DestAS = DstTy.getAddressSpace();
1930   unsigned SrcAS = SrcTy.getAddressSpace();
1931 
1932   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1933   // vector element.
1934   assert(!DstTy.isVector());
1935 
1936   const AMDGPUTargetMachine &TM
1937     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1938 
1939   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1940     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1941     return true;
1942   }
1943 
1944   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1945       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1946        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1947     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1948       // Extract low 32-bits of the pointer.
1949       B.buildExtract(Dst, Src, 0);
1950       MI.eraseFromParent();
1951       return true;
1952     }
1953 
1954     unsigned NullVal = TM.getNullPointerValue(DestAS);
1955 
1956     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1957     auto FlatNull = B.buildConstant(SrcTy, 0);
1958 
1959     // Extract low 32-bits of the pointer.
1960     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1961 
1962     auto CmpRes =
1963         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1964     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1965 
1966     MI.eraseFromParent();
1967     return true;
1968   }
1969 
1970   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1971       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1972        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1973     if (!ST.hasFlatAddressSpace())
1974       return false;
1975 
1976     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1977     if (!ApertureReg.isValid())
1978       return false;
1979 
1980     // Coerce the type of the low half of the result so we can use merge_values.
1981     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1982 
1983     // TODO: Should we allow mismatched types but matching sizes in merges to
1984     // avoid the ptrtoint?
1985     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1986 
1987     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1988       B.buildCopy(Dst, BuildPtr);
1989       MI.eraseFromParent();
1990       return true;
1991     }
1992 
1993     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1994     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1995 
1996     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1997                               SegmentNull.getReg(0));
1998 
1999     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2000 
2001     MI.eraseFromParent();
2002     return true;
2003   }
2004 
2005   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2006       SrcTy.getSizeInBits() == 64) {
2007     // Truncate.
2008     B.buildExtract(Dst, Src, 0);
2009     MI.eraseFromParent();
2010     return true;
2011   }
2012 
2013   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2014       DstTy.getSizeInBits() == 64) {
2015     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2016     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2017 
    // FIXME: This is a bit ugly due to merging 2 pointer values into another
    // pointer. Merge operands are required to be the same type, but creating
    // an extra ptrtoint here would be kind of pointless.
2021     auto HighAddr = B.buildConstant(
2022         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
2023     B.buildMerge(Dst, {Src, HighAddr});
2024     MI.eraseFromParent();
2025     return true;
2026   }
2027 
2028   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2029       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2030 
2031   LLVMContext &Ctx = MF.getFunction().getContext();
2032   Ctx.diagnose(InvalidAddrSpaceCast);
2033   B.buildUndef(Dst);
2034   MI.eraseFromParent();
2035   return true;
2036 }
2037 
2038 bool AMDGPULegalizerInfo::legalizeFrint(
2039   MachineInstr &MI, MachineRegisterInfo &MRI,
2040   MachineIRBuilder &B) const {
2041   Register Src = MI.getOperand(1).getReg();
2042   LLT Ty = MRI.getType(Src);
2043   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2044 
2045   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2046   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
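
  // Adding and then subtracting a copysign'd 2^52 (C1Val) forces the fraction
  // bits of the sum to round away, leaving an integer. Inputs whose magnitude
  // is at least 2^52 (the compare against C2Val) are already integral and are
  // passed through unchanged by the final select.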
2047 
2048   auto C1 = B.buildFConstant(Ty, C1Val);
2049   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2050 
2051   // TODO: Should this propagate fast-math-flags?
2052   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2053   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2054 
2055   auto C2 = B.buildFConstant(Ty, C2Val);
2056   auto Fabs = B.buildFAbs(Ty, Src);
2057 
2058   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2059   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2060   MI.eraseFromParent();
2061   return true;
2062 }
2063 
2064 bool AMDGPULegalizerInfo::legalizeFceil(
2065   MachineInstr &MI, MachineRegisterInfo &MRI,
2066   MachineIRBuilder &B) const {
2067 
2068   const LLT S1 = LLT::scalar(1);
2069   const LLT S64 = LLT::scalar(64);
2070 
2071   Register Src = MI.getOperand(1).getReg();
2072   assert(MRI.getType(Src) == S64);
2073 
2074   // result = trunc(src)
2075   // if (src > 0.0 && src != result)
2076   //   result += 1.0
2077 
2078   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2079 
2080   const auto Zero = B.buildFConstant(S64, 0.0);
2081   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
2085   auto Add = B.buildSelect(S64, And, One, Zero);
2086 
2087   // TODO: Should this propagate fast-math-flags?
2088   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2089   MI.eraseFromParent();
2090   return true;
2091 }
2092 
2093 bool AMDGPULegalizerInfo::legalizeFrem(
2094   MachineInstr &MI, MachineRegisterInfo &MRI,
2095   MachineIRBuilder &B) const {
2096     Register DstReg = MI.getOperand(0).getReg();
2097     Register Src0Reg = MI.getOperand(1).getReg();
2098     Register Src1Reg = MI.getOperand(2).getReg();
2099     auto Flags = MI.getFlags();
2100     LLT Ty = MRI.getType(DstReg);
2101 
2102     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2103     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2104     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2105     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2106     MI.eraseFromParent();
2107     return true;
2108 }
2109 
2110 static MachineInstrBuilder extractF64Exponent(Register Hi,
2111                                               MachineIRBuilder &B) {
2112   const unsigned FractBits = 52;
2113   const unsigned ExpBits = 11;
2114   LLT S32 = LLT::scalar(32);
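
  // An IEEE double keeps its biased exponent in bits [62:52]. The ubfe below
  // extracts those 11 bits from the high word (starting at bit 52 - 32 = 20),
  // and the bias of 1023 is then subtracted to give the unbiased exponent.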
2115 
2116   auto Const0 = B.buildConstant(S32, FractBits - 32);
2117   auto Const1 = B.buildConstant(S32, ExpBits);
2118 
2119   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2120     .addUse(Hi)
2121     .addUse(Const0.getReg(0))
2122     .addUse(Const1.getReg(0));
2123 
2124   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2125 }
2126 
2127 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2128   MachineInstr &MI, MachineRegisterInfo &MRI,
2129   MachineIRBuilder &B) const {
2130   const LLT S1 = LLT::scalar(1);
2131   const LLT S32 = LLT::scalar(32);
2132   const LLT S64 = LLT::scalar(64);
2133 
2134   Register Src = MI.getOperand(1).getReg();
2135   assert(MRI.getType(Src) == S64);
2136 
2137   // TODO: Should this use extract since the low half is unused?
2138   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2139   Register Hi = Unmerge.getReg(1);
2140 
2141   // Extract the upper half, since this is where we will find the sign and
2142   // exponent.
2143   auto Exp = extractF64Exponent(Hi, B);
2144 
2145   const unsigned FractBits = 52;
2146 
2147   // Extract the sign bit.
2148   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2149   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2150 
2151   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2152 
2153   const auto Zero32 = B.buildConstant(S32, 0);
2154 
2155   // Extend back to 64-bits.
2156   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2157 
2158   auto Shr = B.buildAShr(S64, FractMask, Exp);
2159   auto Not = B.buildNot(S64, Shr);
2160   auto Tmp0 = B.buildAnd(S64, Src, Not);
2161   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2162 
2163   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2164   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2165 
2166   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2167   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2168   MI.eraseFromParent();
2169   return true;
2170 }
2171 
2172 bool AMDGPULegalizerInfo::legalizeITOFP(
2173   MachineInstr &MI, MachineRegisterInfo &MRI,
2174   MachineIRBuilder &B, bool Signed) const {
2175 
2176   Register Dst = MI.getOperand(0).getReg();
2177   Register Src = MI.getOperand(1).getReg();
2178 
2179   const LLT S64 = LLT::scalar(64);
2180   const LLT S32 = LLT::scalar(32);
2181 
2182   assert(MRI.getType(Src) == S64);
2183 
2184   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2185   auto ThirtyTwo = B.buildConstant(S32, 32);
2186 
2187   if (MRI.getType(Dst) == S64) {
2188     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2189                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2190 
2191     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2192     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2193                      .addUse(CvtHi.getReg(0))
2194                      .addUse(ThirtyTwo.getReg(0));
2195 
2196     // TODO: Should this propagate fast-math-flags?
2197     B.buildFAdd(Dst, LdExp, CvtLo);
2198     MI.eraseFromParent();
2199     return true;
2200   }
2201 
2202   assert(MRI.getType(Dst) == S32);
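
  // i64 -> f32: shift the value left by (roughly) the number of redundant
  // leading bits of the high word, convert the resulting high 32 bits (OR-ing
  // in a sticky bit from the discarded low half so rounding stays correct),
  // then scale the result back with ldexp(fval, 32 - shift).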
2203 
2204   auto One = B.buildConstant(S32, 1);
2205 
2206   MachineInstrBuilder ShAmt;
2207   if (Signed) {
2208     auto ThirtyOne = B.buildConstant(S32, 31);
2209     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2210     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2211     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2212     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2213                                /*HasSideEffects=*/false)
2214                   .addUse(Unmerge.getReg(1));
2215     auto LS2 = B.buildSub(S32, LS, One);
2216     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2217   } else
2218     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2219   auto Norm = B.buildShl(S64, Src, ShAmt);
2220   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2221   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2222   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2223   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2224   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2225   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2226                    /*HasSideEffects=*/false)
2227       .addUse(FVal.getReg(0))
2228       .addUse(Scale.getReg(0));
2229   MI.eraseFromParent();
2230   return true;
2231 }
2232 
2233 // TODO: Copied from DAG implementation. Verify logic and document how this
2234 // actually works.
2235 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2236                                         MachineRegisterInfo &MRI,
2237                                         MachineIRBuilder &B,
2238                                         bool Signed) const {
2239 
2240   Register Dst = MI.getOperand(0).getReg();
2241   Register Src = MI.getOperand(1).getReg();
2242 
2243   const LLT S64 = LLT::scalar(64);
2244   const LLT S32 = LLT::scalar(32);
2245 
2246   const LLT SrcLT = MRI.getType(Src);
2247   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2248 
2249   unsigned Flags = MI.getFlags();
2250 
2251   // The basic idea of converting a floating point number into a pair of 32-bit
2252   // integers is illustrated as follows:
2253   //
2254   //     tf := trunc(val);
2255   //    hif := floor(tf * 2^-32);
2256   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2257   //     hi := fptoi(hif);
2258   //     lo := fptoi(lof);
2259   //
2260   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2261   MachineInstrBuilder Sign;
2262   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // sign.
2268     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2269     Trunc = B.buildFAbs(S32, Trunc, Flags);
2270   }
2271   MachineInstrBuilder K0, K1;
2272   if (SrcLT == S64) {
2273     K0 = B.buildFConstant(S64,
2274                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2275     K1 = B.buildFConstant(S64,
2276                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2277   } else {
2278     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2279     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2280   }
2281 
2282   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2283   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2284   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2285 
2286   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2287                                      : B.buildFPTOUI(S32, FloorMul);
2288   auto Lo = B.buildFPTOUI(S32, Fma);
2289 
2290   if (Signed && SrcLT == S32) {
2291     // Flip the result based on the signedness, which is either all 0s or 1s.
2292     Sign = B.buildMerge(S64, {Sign, Sign});
2293     // r := xor({lo, hi}, sign) - sign;
2294     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2295   } else
2296     B.buildMerge(Dst, {Lo, Hi});
2297   MI.eraseFromParent();
2298 
2299   return true;
2300 }
2301 
2302 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2303                                                MachineInstr &MI) const {
2304   MachineFunction &MF = Helper.MIRBuilder.getMF();
2305   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2306 
2307   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2308                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2309 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
2312   if (!MFI->getMode().IEEE)
2313     return !IsIEEEOp;
2314 
2315   if (IsIEEEOp)
2316     return true;
2317 
2318   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2319 }
2320 
2321 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2322   MachineInstr &MI, MachineRegisterInfo &MRI,
2323   MachineIRBuilder &B) const {
2324   // TODO: Should move some of this into LegalizerHelper.
2325 
2326   // TODO: Promote dynamic indexing of s16 to s32
2327 
2328   // FIXME: Artifact combiner probably should have replaced the truncated
2329   // constant before this, so we shouldn't need
2330   // getIConstantVRegValWithLookThrough.
2331   Optional<ValueAndVReg> MaybeIdxVal =
2332       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2333   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2334     return true;
2335   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2336 
2337   Register Dst = MI.getOperand(0).getReg();
2338   Register Vec = MI.getOperand(1).getReg();
2339 
2340   LLT VecTy = MRI.getType(Vec);
2341   LLT EltTy = VecTy.getElementType();
2342   assert(EltTy == MRI.getType(Dst));
2343 
2344   if (IdxVal < VecTy.getNumElements()) {
2345     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2346     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2347   } else {
2348     B.buildUndef(Dst);
2349   }
2350 
2351   MI.eraseFromParent();
2352   return true;
2353 }
2354 
2355 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2356   MachineInstr &MI, MachineRegisterInfo &MRI,
2357   MachineIRBuilder &B) const {
2358   // TODO: Should move some of this into LegalizerHelper.
2359 
2360   // TODO: Promote dynamic indexing of s16 to s32
2361 
2362   // FIXME: Artifact combiner probably should have replaced the truncated
2363   // constant before this, so we shouldn't need
2364   // getIConstantVRegValWithLookThrough.
2365   Optional<ValueAndVReg> MaybeIdxVal =
2366       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2367   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2368     return true;
2369 
2370   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2371   Register Dst = MI.getOperand(0).getReg();
2372   Register Vec = MI.getOperand(1).getReg();
2373   Register Ins = MI.getOperand(2).getReg();
2374 
2375   LLT VecTy = MRI.getType(Vec);
2376   LLT EltTy = VecTy.getElementType();
2377   assert(EltTy == MRI.getType(Ins));
2378   (void)Ins;
2379 
2380   unsigned NumElts = VecTy.getNumElements();
2381   if (IdxVal < NumElts) {
2382     SmallVector<Register, 8> SrcRegs;
2383     for (unsigned i = 0; i < NumElts; ++i)
2384       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2385     B.buildUnmerge(SrcRegs, Vec);
2386 
2387     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2388     B.buildMerge(Dst, SrcRegs);
2389   } else {
2390     B.buildUndef(Dst);
2391   }
2392 
2393   MI.eraseFromParent();
2394   return true;
2395 }
2396 
2397 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2398   MachineInstr &MI, MachineRegisterInfo &MRI,
2399   MachineIRBuilder &B) const {
2400   const LLT V2S16 = LLT::fixed_vector(2, 16);
2401 
2402   Register Dst = MI.getOperand(0).getReg();
2403   Register Src0 = MI.getOperand(1).getReg();
2404   LLT DstTy = MRI.getType(Dst);
2405   LLT SrcTy = MRI.getType(Src0);
2406 
2407   if (SrcTy == V2S16 && DstTy == V2S16 &&
2408       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2409     return true;
2410 
2411   MachineIRBuilder HelperBuilder(MI);
2412   GISelObserverWrapper DummyObserver;
2413   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2414   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2415 }
2416 
2417 bool AMDGPULegalizerInfo::legalizeSinCos(
2418   MachineInstr &MI, MachineRegisterInfo &MRI,
2419   MachineIRBuilder &B) const {
2420 
2421   Register DstReg = MI.getOperand(0).getReg();
2422   Register SrcReg = MI.getOperand(1).getReg();
2423   LLT Ty = MRI.getType(DstReg);
2424   unsigned Flags = MI.getFlags();
2425 
2426   Register TrigVal;
2427   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2428   if (ST.hasTrigReducedRange()) {
2429     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2430     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2431       .addUse(MulVal.getReg(0))
2432       .setMIFlags(Flags).getReg(0);
2433   } else
2434     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2435 
2436   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2437     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2438   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2439     .addUse(TrigVal)
2440     .setMIFlags(Flags);
2441   MI.eraseFromParent();
2442   return true;
2443 }
2444 
2445 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2446                                                   MachineIRBuilder &B,
2447                                                   const GlobalValue *GV,
2448                                                   int64_t Offset,
2449                                                   unsigned GAFlags) const {
2450   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2451   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2452   // to the following code sequence:
2453   //
2454   // For constant address space:
2455   //   s_getpc_b64 s[0:1]
2456   //   s_add_u32 s0, s0, $symbol
2457   //   s_addc_u32 s1, s1, 0
2458   //
2459   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2460   //   a fixup or relocation is emitted to replace $symbol with a literal
2461   //   constant, which is a pc-relative offset from the encoding of the $symbol
2462   //   operand to the global variable.
2463   //
2464   // For global address space:
2465   //   s_getpc_b64 s[0:1]
2466   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2467   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2468   //
2469   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2470   //   fixups or relocations are emitted to replace $symbol@*@lo and
2471   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2472   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2473   //   operand to the global variable.
2474   //
2475   // What we want here is an offset from the value returned by s_getpc
2476   // (which is the address of the s_add_u32 instruction) to the global
2477   // variable, but since the encoding of $symbol starts 4 bytes after the start
2478   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2479   // small. This requires us to add 4 to the global variable offset in order to
2480   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2481   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2482   // instruction.
2483 
2484   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2485 
2486   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2487     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2488 
2489   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2490     .addDef(PCReg);
2491 
2492   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2493   if (GAFlags == SIInstrInfo::MO_NONE)
2494     MIB.addImm(0);
2495   else
2496     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2497 
2498   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2499 
2500   if (PtrTy.getSizeInBits() == 32)
2501     B.buildExtract(DstReg, PCReg, 0);
2502   return true;
2503  }
2504 
2505 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2506   MachineInstr &MI, MachineRegisterInfo &MRI,
2507   MachineIRBuilder &B) const {
2508   Register DstReg = MI.getOperand(0).getReg();
2509   LLT Ty = MRI.getType(DstReg);
2510   unsigned AS = Ty.getAddressSpace();
2511 
2512   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2513   MachineFunction &MF = B.getMF();
2514   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2515 
2516   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2517     if (!MFI->isModuleEntryFunction() &&
2518         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2519       const Function &Fn = MF.getFunction();
2520       DiagnosticInfoUnsupported BadLDSDecl(
2521         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2522         DS_Warning);
2523       Fn.getContext().diagnose(BadLDSDecl);
2524 
2525       // We currently don't have a way to correctly allocate LDS objects that
2526       // aren't directly associated with a kernel. We do force inlining of
2527       // functions that use local objects. However, if these dead functions are
2528       // not eliminated, we don't want a compile time error. Just emit a warning
2529       // and a trap, since there should be no callable path here.
2530       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2531       B.buildUndef(DstReg);
2532       MI.eraseFromParent();
2533       return true;
2534     }
2535 
2536     // TODO: We could emit code to handle the initialization somewhere.
2537     // We ignore the initializer for now and legalize it to allow selection.
    // The initializer is diagnosed as an error during assembly emission anyway.
2539     const SITargetLowering *TLI = ST.getTargetLowering();
2540     if (!TLI->shouldUseLDSConstAddress(GV)) {
2541       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2542       return true; // Leave in place;
2543     }
2544 
2545     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2546       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. Such arrays are allocated by
      // the runtime and placed directly after the statically allocated ones,
      // so they all share the same offset.
2552       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2553         // Adjust alignment for that dynamic shared memory array.
2554         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2555         LLT S32 = LLT::scalar(32);
2556         auto Sz =
2557             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2558         B.buildIntToPtr(DstReg, Sz);
2559         MI.eraseFromParent();
2560         return true;
2561       }
2562     }
2563 
2564     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2565                                                    *cast<GlobalVariable>(GV)));
2566     MI.eraseFromParent();
2567     return true;
2568   }
2569 
2570   const SITargetLowering *TLI = ST.getTargetLowering();
2571 
2572   if (TLI->shouldEmitFixup(GV)) {
2573     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2574     MI.eraseFromParent();
2575     return true;
2576   }
2577 
2578   if (TLI->shouldEmitPCReloc(GV)) {
2579     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2580     MI.eraseFromParent();
2581     return true;
2582   }
2583 
2584   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2585   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2586 
2587   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2588   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2589       MachinePointerInfo::getGOT(MF),
2590       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2591           MachineMemOperand::MOInvariant,
2592       LoadTy, Align(8));
2593 
2594   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2595 
2596   if (Ty.getSizeInBits() == 32) {
2597     // Truncate if this is a 32-bit constant address.
2598     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2599     B.buildExtract(DstReg, Load, 0);
2600   } else
2601     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2602 
2603   MI.eraseFromParent();
2604   return true;
2605 }
2606 
2607 static LLT widenToNextPowerOf2(LLT Ty) {
2608   if (Ty.isVector())
2609     return Ty.changeElementCount(
2610         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2611   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2612 }
2613 
2614 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2615                                        MachineInstr &MI) const {
2616   MachineIRBuilder &B = Helper.MIRBuilder;
2617   MachineRegisterInfo &MRI = *B.getMRI();
2618   GISelChangeObserver &Observer = Helper.Observer;
2619 
2620   Register PtrReg = MI.getOperand(1).getReg();
2621   LLT PtrTy = MRI.getType(PtrReg);
2622   unsigned AddrSpace = PtrTy.getAddressSpace();
2623 
2624   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2625     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2626     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2627     Observer.changingInstr(MI);
2628     MI.getOperand(1).setReg(Cast.getReg(0));
2629     Observer.changedInstr(MI);
2630     return true;
2631   }
2632 
2633   if (MI.getOpcode() != AMDGPU::G_LOAD)
2634     return false;
2635 
2636   Register ValReg = MI.getOperand(0).getReg();
2637   LLT ValTy = MRI.getType(ValReg);
2638 
2639   MachineMemOperand *MMO = *MI.memoperands_begin();
2640   const unsigned ValSize = ValTy.getSizeInBits();
2641   const LLT MemTy = MMO->getMemoryType();
2642   const Align MemAlign = MMO->getAlign();
2643   const unsigned MemSize = MemTy.getSizeInBits();
2644   const uint64_t AlignInBits = 8 * MemAlign.value();
2645 
2646   // Widen non-power-of-2 loads to the alignment if needed
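  // For example, a 96-bit (<3 x s32>) load with sufficient alignment may be
  // widened to a 128-bit load, with the extra lane dropped afterwards.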
2647   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2648     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2649 
2650     // This was already the correct extending load result type, so just adjust
2651     // the memory type.
2652     if (WideMemSize == ValSize) {
2653       MachineFunction &MF = B.getMF();
2654 
2655       MachineMemOperand *WideMMO =
2656           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2657       Observer.changingInstr(MI);
2658       MI.setMemRefs(MF, {WideMMO});
2659       Observer.changedInstr(MI);
2660       return true;
2661     }
2662 
    // Don't bother handling an edge case that should probably never be produced.
2664     if (ValSize > WideMemSize)
2665       return false;
2666 
2667     LLT WideTy = widenToNextPowerOf2(ValTy);
2668 
2669     Register WideLoad;
2670     if (!WideTy.isVector()) {
2671       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2672       B.buildTrunc(ValReg, WideLoad).getReg(0);
2673     } else {
2674       // Extract the subvector.
2675 
2676       if (isRegisterType(ValTy)) {
2677         // If this a case where G_EXTRACT is legal, use it.
2678         // (e.g. <3 x s32> -> <4 x s32>)
2679         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2680         B.buildExtract(ValReg, WideLoad, 0);
2681       } else {
        // For cases where the widened type isn't a nice register value, load
        // the widened register and drop the trailing elements
        // (e.g. <3 x s16> -> <4 x s16>).
2684         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2685         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2686       }
2687     }
2688 
2689     MI.eraseFromParent();
2690     return true;
2691   }
2692 
2693   return false;
2694 }
2695 
2696 bool AMDGPULegalizerInfo::legalizeFMad(
2697   MachineInstr &MI, MachineRegisterInfo &MRI,
2698   MachineIRBuilder &B) const {
2699   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2700   assert(Ty.isScalar());
2701 
2702   MachineFunction &MF = B.getMF();
2703   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2704 
2705   // TODO: Always legal with future ftz flag.
  // FIXME: Is checking only the output denormal mode sufficient?
2707   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2708     return true;
2709   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2710     return true;
2711 
2712   MachineIRBuilder HelperBuilder(MI);
2713   GISelObserverWrapper DummyObserver;
2714   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2715   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2716 }
2717 
2718 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2719   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2720   Register DstReg = MI.getOperand(0).getReg();
2721   Register PtrReg = MI.getOperand(1).getReg();
2722   Register CmpVal = MI.getOperand(2).getReg();
2723   Register NewVal = MI.getOperand(3).getReg();
2724 
2725   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2726          "this should not have been custom lowered");
2727 
2728   LLT ValTy = MRI.getType(CmpVal);
2729   LLT VecTy = LLT::fixed_vector(2, ValTy);
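
  // The target cmpxchg pseudo consumes the new value and the compare value
  // packed together as a two-element vector, new value first.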
2730 
2731   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2732 
2733   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2734     .addDef(DstReg)
2735     .addUse(PtrReg)
2736     .addUse(PackedVal)
2737     .setMemRefs(MI.memoperands());
2738 
2739   MI.eraseFromParent();
2740   return true;
2741 }
2742 
2743 bool AMDGPULegalizerInfo::legalizeFlog(
2744   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2745   Register Dst = MI.getOperand(0).getReg();
2746   Register Src = MI.getOperand(1).getReg();
2747   LLT Ty = B.getMRI()->getType(Dst);
2748   unsigned Flags = MI.getFlags();
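
  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes that constant as
  // Log2BaseInverted (ln(2) for G_FLOG, ln(2) / ln(10) for G_FLOG10).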
2749 
2750   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2751   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2752 
2753   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2754   MI.eraseFromParent();
2755   return true;
2756 }
2757 
2758 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2759                                        MachineIRBuilder &B) const {
2760   Register Dst = MI.getOperand(0).getReg();
2761   Register Src = MI.getOperand(1).getReg();
2762   unsigned Flags = MI.getFlags();
2763   LLT Ty = B.getMRI()->getType(Dst);
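
  // exp(x) = exp2(x * log2(e)), so scale by log2(e) and emit G_FEXP2.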
2764 
2765   auto K = B.buildFConstant(Ty, numbers::log2e);
2766   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2767   B.buildFExp2(Dst, Mul, Flags);
2768   MI.eraseFromParent();
2769   return true;
2770 }
2771 
2772 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2773                                        MachineIRBuilder &B) const {
2774   Register Dst = MI.getOperand(0).getReg();
2775   Register Src0 = MI.getOperand(1).getReg();
2776   Register Src1 = MI.getOperand(2).getReg();
2777   unsigned Flags = MI.getFlags();
2778   LLT Ty = B.getMRI()->getType(Dst);
2779   const LLT S16 = LLT::scalar(16);
2780   const LLT S32 = LLT::scalar(32);
2781 
2782   if (Ty == S32) {
2783     auto Log = B.buildFLog2(S32, Src0, Flags);
2784     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2785       .addUse(Log.getReg(0))
2786       .addUse(Src1)
2787       .setMIFlags(Flags);
2788     B.buildFExp2(Dst, Mul, Flags);
2789   } else if (Ty == S16) {
2790     // There's no f16 fmul_legacy, so we need to convert for it.
2791     auto Log = B.buildFLog2(S16, Src0, Flags);
2792     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2793     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2794     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2795       .addUse(Ext0.getReg(0))
2796       .addUse(Ext1.getReg(0))
2797       .setMIFlags(Flags);
2798 
2799     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2800   } else
2801     return false;
2802 
2803   MI.eraseFromParent();
2804   return true;
2805 }
2806 
2807 // Find a source register, ignoring any possible source modifiers.
2808 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2809   Register ModSrc = OrigSrc;
2810   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2811     ModSrc = SrcFNeg->getOperand(1).getReg();
2812     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2813       ModSrc = SrcFAbs->getOperand(1).getReg();
2814   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2815     ModSrc = SrcFAbs->getOperand(1).getReg();
2816   return ModSrc;
2817 }
2818 
2819 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2820                                          MachineRegisterInfo &MRI,
2821                                          MachineIRBuilder &B) const {
2822 
2823   const LLT S1 = LLT::scalar(1);
2824   const LLT S64 = LLT::scalar(64);
2825   Register Dst = MI.getOperand(0).getReg();
2826   Register OrigSrc = MI.getOperand(1).getReg();
2827   unsigned Flags = MI.getFlags();
2828   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2829          "this should not have been custom lowered");
2830 
2831   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2832   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2833   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2834   // V_FRACT bug is:
2835   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2836   //
2837   // Convert floor(x) to (x - fract(x))
2838 
2839   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2840     .addUse(OrigSrc)
2841     .setMIFlags(Flags);
2842 
2843   // Give source modifier matching some assistance before obscuring a foldable
2844   // pattern.
2845 
2846   // TODO: We can avoid the neg on the fract? The input sign to fract
2847   // shouldn't matter?
2848   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2849 
2850   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2851 
2852   Register Min = MRI.createGenericVirtualRegister(S64);
2853 
2854   // We don't need to concern ourselves with the snan handling difference, so
2855   // use the one which will directly select.
2856   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2857   if (MFI->getMode().IEEE)
2858     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2859   else
2860     B.buildFMinNum(Min, Fract, Const, Flags);
2861 
2862   Register CorrectedFract = Min;
2863   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2864     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2865     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2866   }
2867 
2868   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2869   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2870 
2871   MI.eraseFromParent();
2872   return true;
2873 }
2874 
2875 // Turn an illegal packed v2s16 build vector into bit operations.
2876 // TODO: This should probably be a bitcast action in LegalizerHelper.
2877 bool AMDGPULegalizerInfo::legalizeBuildVector(
2878   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2879   Register Dst = MI.getOperand(0).getReg();
2880   const LLT S32 = LLT::scalar(32);
2881   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2882 
2883   Register Src0 = MI.getOperand(1).getReg();
2884   Register Src1 = MI.getOperand(2).getReg();
2885   assert(MRI.getType(Src0) == LLT::scalar(16));
2886 
2887   auto Merge = B.buildMerge(S32, {Src0, Src1});
2888   B.buildBitcast(Dst, Merge);
2889 
2890   MI.eraseFromParent();
2891   return true;
2892 }
2893 
2894 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2895 //
// Source and accumulation registers must all be 32 bits wide.
2897 //
2898 // TODO: When the multiply is uniform, we should produce a code sequence
2899 // that is better suited to instruction selection on the SALU. Instead of
2900 // the outer loop going over parts of the result, the outer loop should go
2901 // over parts of one of the factors. This should result in instruction
2902 // selection that makes full use of S_ADDC_U32 instructions.
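//
// Illustrative sketch (not part of the lowering itself): for NumParts == 2,
// i.e. a 64 x 64 -> 64 bit multiply split into 32-bit parts, the result is
//
//   Accum[0] = lo(Src0[0] * Src1[0])
//   Accum[1] = hi(Src0[0] * Src1[0]) + lo(Src0[0] * Src1[1])
//                                    + lo(Src0[1] * Src1[0])
//
// where additions into the most significant part simply wrap, since anything
// above the destination width is discarded.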
2903 void AMDGPULegalizerInfo::buildMultiply(
2904     LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2905     ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2906     bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2907   // Use (possibly empty) vectors of S1 registers to represent the set of
2908   // carries from one pair of positions to the next.
2909   using Carry = SmallVector<Register, 2>;
2910 
2911   MachineIRBuilder &B = Helper.MIRBuilder;
2912 
2913   const LLT S1 = LLT::scalar(1);
2914   const LLT S32 = LLT::scalar(32);
2915   const LLT S64 = LLT::scalar(64);
2916 
2917   Register Zero32;
2918   Register Zero64;
2919 
2920   auto getZero32 = [&]() -> Register {
2921     if (!Zero32)
2922       Zero32 = B.buildConstant(S32, 0).getReg(0);
2923     return Zero32;
2924   };
2925   auto getZero64 = [&]() -> Register {
2926     if (!Zero64)
2927       Zero64 = B.buildConstant(S64, 0).getReg(0);
2928     return Zero64;
2929   };
2930 
2931   // Merge the given carries into the 32-bit LocalAccum, which is modified
2932   // in-place.
2933   //
2934   // Returns the carry-out, which is a single S1 register or null.
2935   auto mergeCarry =
2936       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2937         if (CarryIn.empty())
2938           return Register();
2939 
2940         bool HaveCarryOut = true;
2941         Register CarryAccum;
2942         if (CarryIn.size() == 1) {
2943           if (!LocalAccum) {
2944             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2945             return Register();
2946           }
2947 
2948           CarryAccum = getZero32();
2949         } else {
2950           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2951           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2952             CarryAccum =
2953                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2954                     .getReg(0);
2955           }
2956 
2957           if (!LocalAccum) {
2958             LocalAccum = getZero32();
2959             HaveCarryOut = false;
2960           }
2961         }
2962 
2963         auto Add =
2964             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2965         LocalAccum = Add.getReg(0);
2966         return HaveCarryOut ? Add.getReg(1) : Register();
2967       };
2968 
2969   // Build a multiply-add chain to compute
2970   //
2971   //   LocalAccum + (partial products at DstIndex)
2972   //       + (opportunistic subset of CarryIn)
2973   //
2974   // LocalAccum is an array of one or two 32-bit registers that are updated
2975   // in-place. The incoming registers may be null.
2976   //
2977   // In some edge cases, carry-ins can be consumed "for free". In that case,
2978   // the consumed carry bits are removed from CarryIn in-place.
2979   auto buildMadChain =
2980       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2981           -> Carry {
2982         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2983                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2984 
2985         Carry CarryOut;
2986         unsigned j0 = 0;
2987 
2988         // Use plain 32-bit multiplication for the most significant part of the
2989         // result by default.
2990         if (LocalAccum.size() == 1 &&
2991             (!UsePartialMad64_32 || !CarryIn.empty())) {
2992           do {
2993             unsigned j1 = DstIndex - j0;
2994             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
2995             if (!LocalAccum[0]) {
2996               LocalAccum[0] = Mul.getReg(0);
2997             } else {
2998               if (CarryIn.empty()) {
2999                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3000               } else {
3001                 LocalAccum[0] =
3002                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3003                         .getReg(0);
3004                 CarryIn.pop_back();
3005               }
3006             }
3007             ++j0;
3008           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3009         }
3010 
3011         // Build full 64-bit multiplies.
3012         if (j0 <= DstIndex) {
3013           bool HaveSmallAccum = false;
3014           Register Tmp;
3015 
3016           if (LocalAccum[0]) {
3017             if (LocalAccum.size() == 1) {
3018               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3019               HaveSmallAccum = true;
3020             } else if (LocalAccum[1]) {
3021               Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
3022               HaveSmallAccum = false;
3023             } else {
3024               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3025               HaveSmallAccum = true;
3026             }
3027           } else {
3028             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3029             Tmp = getZero64();
3030             HaveSmallAccum = true;
3031           }
3032 
3033           do {
3034             unsigned j1 = DstIndex - j0;
3035             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3036                                     {Src0[j0], Src1[j1], Tmp});
3037             Tmp = Mad.getReg(0);
3038             if (!HaveSmallAccum)
3039               CarryOut.push_back(Mad.getReg(1));
3040             HaveSmallAccum = false;
3041             ++j0;
3042           } while (j0 <= DstIndex);
3043 
3044           auto Unmerge = B.buildUnmerge(S32, Tmp);
3045           LocalAccum[0] = Unmerge.getReg(0);
3046           if (LocalAccum.size() > 1)
3047             LocalAccum[1] = Unmerge.getReg(1);
3048         }
3049 
3050         return CarryOut;
3051       };
3052 
3053   // Outer multiply loop, iterating over destination parts from least
3054   // significant to most significant parts.
3055   //
3056   // The columns of the following diagram correspond to the destination parts
3057   // affected by one iteration of the outer loop (ignoring boundary
3058   // conditions).
3059   //
3060   //   Dest index relative to 2 * i:      1 0 -1
3061   //                                      ------
3062   //   Carries from previous iteration:     e o
3063   //   Even-aligned partial product sum:  E E .
3064   //   Odd-aligned partial product sum:     O O
3065   //
3066   // 'o' is OddCarry, 'e' is EvenCarry.
3067   // EE and OO are computed from partial products via buildMadChain and use
3068   // accumulation where possible and appropriate.
3069   //
3070   Register SeparateOddCarry;
3071   Carry EvenCarry;
3072   Carry OddCarry;
3073 
3074   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3075     Carry OddCarryIn = std::move(OddCarry);
3076     Carry EvenCarryIn = std::move(EvenCarry);
3077     OddCarry.clear();
3078     EvenCarry.clear();
3079 
3080     // Partial products at offset 2 * i.
3081     if (2 * i < Accum.size()) {
3082       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3083       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3084     }
3085 
3086     // Partial products at offset 2 * i - 1.
3087     if (i > 0) {
3088       if (!SeparateOddAlignedProducts) {
3089         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3090         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3091       } else {
3092         bool IsHighest = 2 * i >= Accum.size();
3093         Register SeparateOddOut[2];
3094         auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
3095                               .take_front(IsHighest ? 1 : 2);
3096         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3097 
3098         MachineInstr *Lo;
3099 
3100         if (i == 1) {
3101           if (!IsHighest)
3102             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3103           else
3104             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3105         } else {
3106           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3107                             SeparateOddCarry);
3108         }
3109         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3110 
3111         if (!IsHighest) {
3112           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3113                                 Lo->getOperand(1).getReg());
3114           Accum[2 * i] = Hi.getReg(0);
3115           SeparateOddCarry = Hi.getReg(1);
3116         }
3117       }
3118     }
3119 
3120     // Add in the carries from the previous iteration
3121     if (i > 0) {
3122       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3123         EvenCarryIn.push_back(CarryOut);
3124 
3125       if (2 * i < Accum.size()) {
3126         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3127           OddCarry.push_back(CarryOut);
3128       }
3129     }
3130   }
3131 }
3132 
3133 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3134 //
3135 // TODO: If the multiply is followed by an addition, we should attempt to
3136 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3137 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3138                                       MachineInstr &MI) const {
3139   assert(ST.hasMad64_32());
3140   assert(MI.getOpcode() == TargetOpcode::G_MUL);
3141 
3142   MachineIRBuilder &B = Helper.MIRBuilder;
3143   MachineRegisterInfo &MRI = *B.getMRI();
3144 
3145   Register DstReg = MI.getOperand(0).getReg();
3146   Register Src0 = MI.getOperand(1).getReg();
3147   Register Src1 = MI.getOperand(2).getReg();
3148 
3149   LLT Ty = MRI.getType(DstReg);
3150   assert(Ty.isScalar());
3151 
3152   unsigned Size = Ty.getSizeInBits();
3153   unsigned NumParts = Size / 32;
3154   assert((Size % 32) == 0);
3155   assert(NumParts >= 2);
3156 
3157   // Whether to use MAD_64_32 for partial products whose high half is
3158   // discarded. This avoids some ADD instructions but risks false dependency
3159   // stalls on some subtargets in some cases.
3160   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3161 
3162   // Whether to compute odd-aligned partial products separately. This is
3163   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3164   // in an even-aligned VGPR.
3165   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3166 
3167   LLT S32 = LLT::scalar(32);
3168   SmallVector<Register, 2> Src0Parts, Src1Parts;
3169   for (unsigned i = 0; i < NumParts; ++i) {
3170     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3171     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3172   }
3173   B.buildUnmerge(Src0Parts, Src0);
3174   B.buildUnmerge(Src1Parts, Src1);
3175 
3176   SmallVector<Register, 2> AccumRegs(NumParts);
3177   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3178                 SeparateOddAlignedProducts);
3179 
3180   B.buildMerge(DstReg, AccumRegs);
3181   MI.eraseFromParent();
  return true;
}
3185 
3186 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3187 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3188 // case with a single min instruction instead of a compare+select.
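// FFBH/FFBL return -1 (all ones) for a zero input, so clamping the result
// with umin(result, source bit width) yields the expected
// ctlz(0)/cttz(0) == bit width.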
3189 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3190                                             MachineRegisterInfo &MRI,
3191                                             MachineIRBuilder &B) const {
3192   Register Dst = MI.getOperand(0).getReg();
3193   Register Src = MI.getOperand(1).getReg();
3194   LLT DstTy = MRI.getType(Dst);
3195   LLT SrcTy = MRI.getType(Src);
3196 
3197   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3198                         ? AMDGPU::G_AMDGPU_FFBH_U32
3199                         : AMDGPU::G_AMDGPU_FFBL_B32;
3200   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3201   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3202 
3203   MI.eraseFromParent();
3204   return true;
3205 }
3206 
3207 // Check that this is a G_XOR x, -1
3208 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3209   if (MI.getOpcode() != TargetOpcode::G_XOR)
3210     return false;
3211   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3212   return ConstVal && *ConstVal == -1;
3213 }
3214 
// Return the use branch instruction, or null if the usage is invalid.
3216 static MachineInstr *
3217 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3218                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3219   Register CondDef = MI.getOperand(0).getReg();
3220   if (!MRI.hasOneNonDBGUse(CondDef))
3221     return nullptr;
3222 
3223   MachineBasicBlock *Parent = MI.getParent();
3224   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3225 
3226   if (isNot(MRI, *UseMI)) {
3227     Register NegatedCond = UseMI->getOperand(0).getReg();
3228     if (!MRI.hasOneNonDBGUse(NegatedCond))
3229       return nullptr;
3230 
3231     // We're deleting the def of this value, so we need to remove it.
3232     eraseInstr(*UseMI, MRI);
3233 
3234     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3235     Negated = true;
3236   }
3237 
3238   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3239     return nullptr;
3240 
3241   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3242   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3243   if (Next == Parent->end()) {
3244     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3245     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3246       return nullptr;
3247     UncondBrTarget = &*NextMBB;
3248   } else {
3249     if (Next->getOpcode() != AMDGPU::G_BR)
3250       return nullptr;
3251     Br = &*Next;
3252     UncondBrTarget = Br->getOperand(0).getMBB();
3253   }
3254 
3255   return UseMI;
3256 }
3257 
3258 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3259                                          const ArgDescriptor *Arg,
3260                                          const TargetRegisterClass *ArgRC,
3261                                          LLT ArgTy) const {
3262   MCRegister SrcReg = Arg->getRegister();
3263   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3264   assert(DstReg.isVirtual() && "Virtual register expected");
3265 
3266   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3267                                              *ArgRC, B.getDebugLoc(), ArgTy);
3268   if (Arg->isMasked()) {
3269     // TODO: Should we try to emit this once in the entry block?
3270     const LLT S32 = LLT::scalar(32);
3271     const unsigned Mask = Arg->getMask();
3272     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
3273 
3274     Register AndMaskSrc = LiveIn;
3275 
3276     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3277     // 0.
3278     if (Shift != 0) {
3279       auto ShiftAmt = B.buildConstant(S32, Shift);
3280       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3281     }
3282 
3283     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3284   } else {
3285     B.buildCopy(DstReg, LiveIn);
3286   }
3287 
3288   return true;
3289 }
3290 
3291 bool AMDGPULegalizerInfo::loadInputValue(
3292     Register DstReg, MachineIRBuilder &B,
3293     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3294   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3295   const ArgDescriptor *Arg;
3296   const TargetRegisterClass *ArgRC;
3297   LLT ArgTy;
3298   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3299 
3300   if (!Arg) {
3301     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a zero-sized kernarg segment,
      // in which case the pointer argument may be missing and we use null.
3304       B.buildConstant(DstReg, 0);
3305       return true;
3306     }
3307 
3308     // It's undefined behavior if a function marked with the amdgpu-no-*
3309     // attributes uses the corresponding intrinsic.
3310     B.buildUndef(DstReg);
3311     return true;
3312   }
3313 
3314   if (!Arg->isRegister() || !Arg->getRegister().isValid())
3315     return false; // TODO: Handle these
3316   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3317 }
3318 
3319 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3320     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3321     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3322   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3323     return false;
3324 
3325   MI.eraseFromParent();
3326   return true;
3327 }
3328 
3329 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3330                                 int64_t C) {
3331   B.buildConstant(MI.getOperand(0).getReg(), C);
3332   MI.eraseFromParent();
3333   return true;
3334 }
3335 
3336 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3337     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3338     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3339   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3340   if (MaxID == 0)
3341     return replaceWithConstant(B, MI, 0);
3342 
3343   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3344   const ArgDescriptor *Arg;
3345   const TargetRegisterClass *ArgRC;
3346   LLT ArgTy;
3347   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3348 
3349   Register DstReg = MI.getOperand(0).getReg();
3350   if (!Arg) {
3351     // It's undefined behavior if a function marked with the amdgpu-no-*
3352     // attributes uses the corresponding intrinsic.
3353     B.buildUndef(DstReg);
3354     MI.eraseFromParent();
3355     return true;
3356   }
3357 
3358   if (Arg->isMasked()) {
3359     // Don't bother inserting AssertZext for packed IDs since we're emitting the
3360     // masking operations anyway.
3361     //
3362     // TODO: We could assert the top bit is 0 for the source copy.
3363     if (!loadInputValue(DstReg, B, ArgType))
3364       return false;
3365   } else {
3366     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3367     if (!loadInputValue(TmpReg, B, ArgType))
3368       return false;
3369     B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3370   }
3371 
3372   MI.eraseFromParent();
3373   return true;
3374 }
3375 
3376 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3377                                                      int64_t Offset) const {
3378   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3379   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3380 
3381   // TODO: If we passed in the base kernel offset we could have a better
3382   // alignment than 4, but we don't really need it.
3383   if (!loadInputValue(KernArgReg, B,
3384                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3385     llvm_unreachable("failed to find kernarg segment ptr");
3386 
3387   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3388   // TODO: Should get nuw
3389   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3390 }
3391 
3392 /// Legalize a value that's loaded from kernel arguments. This is only used by
3393 /// legacy intrinsics.
3394 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3395                                                       MachineIRBuilder &B,
3396                                                       uint64_t Offset,
3397                                                       Align Alignment) const {
3398   Register DstReg = MI.getOperand(0).getReg();
3399 
3400   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3401          "unexpected kernarg parameter type");
3402 
3403   Register Ptr = getKernargParameterPtr(B, Offset);
3404   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3405   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3406               MachineMemOperand::MODereferenceable |
3407                   MachineMemOperand::MOInvariant);
3408   MI.eraseFromParent();
3409   return true;
3410 }
3411 
3412 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3413                                        MachineRegisterInfo &MRI,
3414                                        MachineIRBuilder &B) const {
3415   Register Dst = MI.getOperand(0).getReg();
3416   LLT DstTy = MRI.getType(Dst);
3417   LLT S16 = LLT::scalar(16);
3418   LLT S32 = LLT::scalar(32);
3419   LLT S64 = LLT::scalar(64);
3420 
3421   if (DstTy == S16)
3422     return legalizeFDIV16(MI, MRI, B);
3423   if (DstTy == S32)
3424     return legalizeFDIV32(MI, MRI, B);
3425   if (DstTy == S64)
3426     return legalizeFDIV64(MI, MRI, B);
3427 
3428   return false;
3429 }
3430 
3431 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3432                                                         Register DstDivReg,
3433                                                         Register DstRemReg,
3434                                                         Register X,
3435                                                         Register Y) const {
3436   const LLT S1 = LLT::scalar(1);
3437   const LLT S32 = LLT::scalar(32);
3438 
3439   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3440   // algorithm used here.
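  // Roughly, as a sketch mirroring that expansion:
  //   z  = (uint)(asfloat(0x4f7ffffe) * rcp((float)y))  // ~(2^32 / y) estimate
  //   z += umulh(z, z * -y)                             // one N-R refinement
  //   q  = umulh(x, z);  r = x - q * y                  // initial estimates
  // followed by two conditional corrections of q and r.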
3441 
3442   // Initial estimate of inv(y).
3443   auto FloatY = B.buildUITOFP(S32, Y);
3444   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3445   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3446   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3447   auto Z = B.buildFPTOUI(S32, ScaledY);
3448 
  // One round of Newton-Raphson refinement (UNR).
3450   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3451   auto NegYZ = B.buildMul(S32, NegY, Z);
3452   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3453 
3454   // Quotient/remainder estimate.
3455   auto Q = B.buildUMulH(S32, X, Z);
3456   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3457 
3458   // First quotient/remainder refinement.
3459   auto One = B.buildConstant(S32, 1);
3460   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3461   if (DstDivReg)
3462     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3463   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3464 
3465   // Second quotient/remainder refinement.
3466   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3467   if (DstDivReg)
3468     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3469 
3470   if (DstRemReg)
3471     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3472 }
3473 
3474 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3475 //
3476 // Return lo, hi of result
3477 //
3478 // %cvt.lo = G_UITOFP Val.lo
3479 // %cvt.hi = G_UITOFP Val.hi
3480 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3481 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3482 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3483 // %mul2 = G_FMUL %mul1, 2**(-32)
3484 // %trunc = G_INTRINSIC_TRUNC %mul2
3485 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3486 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3487 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3488                                                        Register Val) {
3489   const LLT S32 = LLT::scalar(32);
3490   auto Unmerge = B.buildUnmerge(S32, Val);
3491 
3492   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3493   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3494 
3495   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3496                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3497 
3498   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3499   auto Mul1 =
3500       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3501 
3502   // 2**(-32)
3503   auto Mul2 =
3504       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3505   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3506 
3507   // -(2**32)
3508   auto Mad2 = B.buildFMAD(S32, Trunc,
3509                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3510 
3511   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3512   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3513 
3514   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3515 }
3516 
3517 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3518                                                         Register DstDivReg,
3519                                                         Register DstRemReg,
3520                                                         Register Numer,
3521                                                         Register Denom) const {
3522   const LLT S32 = LLT::scalar(32);
3523   const LLT S64 = LLT::scalar(64);
3524   const LLT S1 = LLT::scalar(1);
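  // The structure mirrors the 32-bit expansion: build a 2^64-scaled
  // reciprocal estimate of the denominator, refine it twice, form the
  // quotient estimate with a 64-bit umulh, and then apply up to two
  // conditional corrections (the "if C3" / "if C6" blocks below).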
3525   Register RcpLo, RcpHi;
3526 
3527   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3528 
3529   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3530 
3531   auto Zero64 = B.buildConstant(S64, 0);
3532   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3533 
3534   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3535   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3536 
3537   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3538   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3539   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3540 
3541   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3542   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3543   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3544 
3545   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3546   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3547   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3548   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3549   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3550 
3551   auto Zero32 = B.buildConstant(S32, 0);
3552   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3553   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3554   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3555 
3556   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3557   Register NumerLo = UnmergeNumer.getReg(0);
3558   Register NumerHi = UnmergeNumer.getReg(1);
3559 
3560   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3561   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3562   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3563   Register Mul3_Lo = UnmergeMul3.getReg(0);
3564   Register Mul3_Hi = UnmergeMul3.getReg(1);
3565   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3566   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3567   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3568   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3569 
3570   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3571   Register DenomLo = UnmergeDenom.getReg(0);
3572   Register DenomHi = UnmergeDenom.getReg(1);
3573 
3574   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3575   auto C1 = B.buildSExt(S32, CmpHi);
3576 
3577   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3578   auto C2 = B.buildSExt(S32, CmpLo);
3579 
3580   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3581   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3582 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional, and we have 4 selects
  // after the potential endif standing in for PHIs.
3586 
3587   // if C3 != 0 ...
3588   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3589   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3590   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3591   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3592 
3593   auto One64 = B.buildConstant(S64, 1);
3594   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3595 
3596   auto C4 =
3597       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3598   auto C5 =
3599       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3600   auto C6 = B.buildSelect(
3601       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3602 
3603   // if (C6 != 0)
3604   auto Add4 = B.buildAdd(S64, Add3, One64);
3605   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3606 
3607   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3608   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3609   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3610 
3611   // endif C6
3612   // endif C3
3613 
3614   if (DstDivReg) {
3615     auto Sel1 = B.buildSelect(
3616         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3617     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3618                   Sel1, MulHi3);
3619   }
3620 
3621   if (DstRemReg) {
3622     auto Sel2 = B.buildSelect(
3623         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3624     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3625                   Sel2, Sub1);
3626   }
3627 }
3628 
3629 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3630                                                   MachineRegisterInfo &MRI,
3631                                                   MachineIRBuilder &B) const {
3632   Register DstDivReg, DstRemReg;
3633   switch (MI.getOpcode()) {
3634   default:
3635     llvm_unreachable("Unexpected opcode!");
3636   case AMDGPU::G_UDIV: {
3637     DstDivReg = MI.getOperand(0).getReg();
3638     break;
3639   }
3640   case AMDGPU::G_UREM: {
3641     DstRemReg = MI.getOperand(0).getReg();
3642     break;
3643   }
3644   case AMDGPU::G_UDIVREM: {
3645     DstDivReg = MI.getOperand(0).getReg();
3646     DstRemReg = MI.getOperand(1).getReg();
3647     break;
3648   }
3649   }
3650 
3651   const LLT S64 = LLT::scalar(64);
3652   const LLT S32 = LLT::scalar(32);
3653   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3654   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3655   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3656   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3657 
3658   if (Ty == S32)
3659     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3660   else if (Ty == S64)
3661     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3662   else
3663     return false;
3664 
3665   MI.eraseFromParent();
3666   return true;
3667 }
3668 
3669 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3670                                                 MachineRegisterInfo &MRI,
3671                                                 MachineIRBuilder &B) const {
3672   const LLT S64 = LLT::scalar(64);
3673   const LLT S32 = LLT::scalar(32);
3674 
3675   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3676   if (Ty != S32 && Ty != S64)
3677     return false;
3678 
3679   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3680   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3681   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3682 
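  // Reduce to the unsigned case by taking absolute values first, using
  // abs(x) == (x + sign) ^ sign with sign == x >> (bit width - 1). The
  // quotient sign is sign(LHS) ^ sign(RHS); the remainder sign follows LHS.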
3683   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3684   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3685   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3686 
3687   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3688   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3689 
3690   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3691   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3692 
3693   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3694   switch (MI.getOpcode()) {
3695   default:
3696     llvm_unreachable("Unexpected opcode!");
3697   case AMDGPU::G_SDIV: {
3698     DstDivReg = MI.getOperand(0).getReg();
3699     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3700     break;
3701   }
3702   case AMDGPU::G_SREM: {
3703     DstRemReg = MI.getOperand(0).getReg();
3704     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3705     break;
3706   }
3707   case AMDGPU::G_SDIVREM: {
3708     DstDivReg = MI.getOperand(0).getReg();
3709     DstRemReg = MI.getOperand(1).getReg();
3710     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3711     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3712     break;
3713   }
3714   }
3715 
3716   if (Ty == S32)
3717     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3718   else
3719     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3720 
3721   if (DstDivReg) {
3722     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3723     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3724     B.buildSub(DstDivReg, SignXor, Sign);
3725   }
3726 
3727   if (DstRemReg) {
3728     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3729     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3730     B.buildSub(DstRemReg, SignXor, Sign);
3731   }
3732 
3733   MI.eraseFromParent();
3734   return true;
3735 }
3736 
3737 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3738                                                  MachineRegisterInfo &MRI,
3739                                                  MachineIRBuilder &B) const {
3740   Register Res = MI.getOperand(0).getReg();
3741   Register LHS = MI.getOperand(1).getReg();
3742   Register RHS = MI.getOperand(2).getReg();
3743   uint16_t Flags = MI.getFlags();
3744   LLT ResTy = MRI.getType(Res);
3745 
3746   const MachineFunction &MF = B.getMF();
3747   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3748                             MI.getFlag(MachineInstr::FmAfn);
3749 
3750   if (!AllowInaccurateRcp)
3751     return false;
3752 
3753   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3754     // 1 / x -> RCP(x)
3755     if (CLHS->isExactlyValue(1.0)) {
3756       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3757         .addUse(RHS)
3758         .setMIFlags(Flags);
3759 
3760       MI.eraseFromParent();
3761       return true;
3762     }
3763 
3764     // -1 / x -> RCP( FNEG(x) )
3765     if (CLHS->isExactlyValue(-1.0)) {
3766       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3767       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3768         .addUse(FNeg.getReg(0))
3769         .setMIFlags(Flags);
3770 
3771       MI.eraseFromParent();
3772       return true;
3773     }
3774   }
3775 
3776   // x / y -> x * (1.0 / y)
3777   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3778     .addUse(RHS)
3779     .setMIFlags(Flags);
3780   B.buildFMul(Res, LHS, RCP, Flags);
3781 
3782   MI.eraseFromParent();
3783   return true;
3784 }
3785 
3786 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3787                                                    MachineRegisterInfo &MRI,
3788                                                    MachineIRBuilder &B) const {
3789   Register Res = MI.getOperand(0).getReg();
3790   Register X = MI.getOperand(1).getReg();
3791   Register Y = MI.getOperand(2).getReg();
3792   uint16_t Flags = MI.getFlags();
3793   LLT ResTy = MRI.getType(Res);
3794 
3795   const MachineFunction &MF = B.getMF();
3796   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3797                             MI.getFlag(MachineInstr::FmAfn);
3798 
3799   if (!AllowInaccurateRcp)
3800     return false;
3801 
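  // Two Newton-Raphson iterations refine r ~= 1/y (r' = r + r * (1 - y * r)),
  // then the quotient q = x * r gets one more refinement:
  //   result = q + r * (x - y * q)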
3802   auto NegY = B.buildFNeg(ResTy, Y);
3803   auto One = B.buildFConstant(ResTy, 1.0);
3804 
3805   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3806     .addUse(Y)
3807     .setMIFlags(Flags);
3808 
3809   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3810   R = B.buildFMA(ResTy, Tmp0, R, R);
3811 
3812   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3813   R = B.buildFMA(ResTy, Tmp1, R, R);
3814 
3815   auto Ret = B.buildFMul(ResTy, X, R);
3816   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3817 
3818   B.buildFMA(Res, Tmp2, R, Ret);
3819   MI.eraseFromParent();
3820   return true;
3821 }
3822 
3823 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3824                                          MachineRegisterInfo &MRI,
3825                                          MachineIRBuilder &B) const {
3826   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3827     return true;
3828 
3829   Register Res = MI.getOperand(0).getReg();
3830   Register LHS = MI.getOperand(1).getReg();
3831   Register RHS = MI.getOperand(2).getReg();
3832 
3833   uint16_t Flags = MI.getFlags();
3834 
3835   LLT S16 = LLT::scalar(16);
3836   LLT S32 = LLT::scalar(32);
3837 
3838   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3839   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3840 
3841   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3842     .addUse(RHSExt.getReg(0))
3843     .setMIFlags(Flags);
3844 
3845   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3846   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3847 
3848   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3849     .addUse(RDst.getReg(0))
3850     .addUse(RHS)
3851     .addUse(LHS)
3852     .setMIFlags(Flags);
3853 
3854   MI.eraseFromParent();
3855   return true;
3856 }
3857 
// Enable or disable FP32 denormal handling. When 'Enable' is true, emit
// instructions that set the FP32 denorm mode to FLUSH_NONE; when false,
// restore the function's default FP32 denorm mode.
3860 static void toggleSPDenormMode(bool Enable,
3861                                MachineIRBuilder &B,
3862                                const GCNSubtarget &ST,
3863                                AMDGPU::SIModeRegisterDefaults Mode) {
3864   // Set SP denorm mode to this value.
3865   unsigned SPDenormMode =
3866     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3867 
3868   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3870     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3871 
3872     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3873     B.buildInstr(AMDGPU::S_DENORM_MODE)
3874       .addImm(NewDenormModeValue);
3875 
3876   } else {
3877     // Select FP32 bit field in mode register.
3878     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3879                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3880                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3881 
3882     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3883       .addImm(SPDenormMode)
3884       .addImm(SPDenormModeBitField);
3885   }
3886 }
3887 
3888 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3889                                          MachineRegisterInfo &MRI,
3890                                          MachineIRBuilder &B) const {
3891   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3892     return true;
3893 
3894   Register Res = MI.getOperand(0).getReg();
3895   Register LHS = MI.getOperand(1).getReg();
3896   Register RHS = MI.getOperand(2).getReg();
3897   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3898   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3899 
3900   uint16_t Flags = MI.getFlags();
3901 
3902   LLT S32 = LLT::scalar(32);
3903   LLT S1 = LLT::scalar(1);
3904 
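  // Full-precision f32 division: scale the operands with div_scale, refine a
  // reciprocal of the scaled denominator with a chain of FMAs carried out
  // with FP32 denormals enabled (toggled below when the function's default
  // mode flushes them), then undo the scaling and handle the special cases
  // with div_fmas + div_fixup.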
3905   auto One = B.buildFConstant(S32, 1.0f);
3906 
3907   auto DenominatorScaled =
3908     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3909       .addUse(LHS)
3910       .addUse(RHS)
3911       .addImm(0)
3912       .setMIFlags(Flags);
3913   auto NumeratorScaled =
3914     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3915       .addUse(LHS)
3916       .addUse(RHS)
3917       .addImm(1)
3918       .setMIFlags(Flags);
3919 
3920   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3921     .addUse(DenominatorScaled.getReg(0))
3922     .setMIFlags(Flags);
3923   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3924 
3925   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3926   // aren't modeled as reading it.
3927   if (!Mode.allFP32Denormals())
3928     toggleSPDenormMode(true, B, ST, Mode);
3929 
3930   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3931   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3932   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3933   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3934   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3935   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3936 
3937   if (!Mode.allFP32Denormals())
3938     toggleSPDenormMode(false, B, ST, Mode);
3939 
3940   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3941     .addUse(Fma4.getReg(0))
3942     .addUse(Fma1.getReg(0))
3943     .addUse(Fma3.getReg(0))
3944     .addUse(NumeratorScaled.getReg(1))
3945     .setMIFlags(Flags);
3946 
3947   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3948     .addUse(Fmas.getReg(0))
3949     .addUse(RHS)
3950     .addUse(LHS)
3951     .setMIFlags(Flags);
3952 
3953   MI.eraseFromParent();
3954   return true;
3955 }
3956 
3957 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3958                                          MachineRegisterInfo &MRI,
3959                                          MachineIRBuilder &B) const {
3960   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3961     return true;
3962 
3963   Register Res = MI.getOperand(0).getReg();
3964   Register LHS = MI.getOperand(1).getReg();
3965   Register RHS = MI.getOperand(2).getReg();
3966 
3967   uint16_t Flags = MI.getFlags();
3968 
3969   LLT S64 = LLT::scalar(64);
3970   LLT S1 = LLT::scalar(1);
3971 
3972   auto One = B.buildFConstant(S64, 1.0);
3973 
3974   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3975     .addUse(LHS)
3976     .addUse(RHS)
3977     .addImm(0)
3978     .setMIFlags(Flags);
3979 
3980   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3981 
3982   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3983     .addUse(DivScale0.getReg(0))
3984     .setMIFlags(Flags);
3985 
3986   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3987   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3988   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3989 
3990   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3991     .addUse(LHS)
3992     .addUse(RHS)
3993     .addImm(1)
3994     .setMIFlags(Flags);
3995 
3996   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3997   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3998   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3999 
4000   Register Scale;
4001   if (!ST.hasUsableDivScaleConditionOutput()) {
4002     // Workaround a hardware bug on SI where the condition output from div_scale
4003     // is not usable.
4004 
4005     LLT S32 = LLT::scalar(32);
4006 
4007     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4008     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4009     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4010     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4011 
4012     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4013                               Scale1Unmerge.getReg(1));
4014     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4015                               Scale0Unmerge.getReg(1));
4016     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4017   } else {
4018     Scale = DivScale1.getReg(1);
4019   }
4020 
4021   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4022     .addUse(Fma4.getReg(0))
4023     .addUse(Fma3.getReg(0))
4024     .addUse(Mul.getReg(0))
4025     .addUse(Scale)
4026     .setMIFlags(Flags);
4027 
4028   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
4029     .addUse(Fmas.getReg(0))
4030     .addUse(RHS)
4031     .addUse(LHS)
4032     .setMIFlags(Flags);
4033 
4034   MI.eraseFromParent();
4035   return true;
4036 }
4037 
4038 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4039                                                  MachineRegisterInfo &MRI,
4040                                                  MachineIRBuilder &B) const {
4041   Register Res = MI.getOperand(0).getReg();
4042   Register LHS = MI.getOperand(2).getReg();
4043   Register RHS = MI.getOperand(3).getReg();
4044   uint16_t Flags = MI.getFlags();
4045 
4046   LLT S32 = LLT::scalar(32);
4047   LLT S1 = LLT::scalar(1);
4048 
4049   auto Abs = B.buildFAbs(S32, RHS, Flags);
4050   const APFloat C0Val(1.0f);
4051 
4052   auto C0 = B.buildConstant(S32, 0x6f800000);
4053   auto C1 = B.buildConstant(S32, 0x2f800000);
4054   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4055 
4056   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4057   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4058 
4059   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4060 
4061   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4062     .addUse(Mul0.getReg(0))
4063     .setMIFlags(Flags);
4064 
4065   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4066 
4067   B.buildFMul(Res, Sel, Mul1, Flags);
4068 
4069   MI.eraseFromParent();
4070   return true;
4071 }
4072 
4073 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4074 // FIXME: Why do we handle this one but not other removed instructions?
4075 //
4076 // Reciprocal square root.  The clamp prevents infinite results, clamping
4077 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4078 // +-max_float.
4079 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4080                                                     MachineRegisterInfo &MRI,
4081                                                     MachineIRBuilder &B) const {
4082   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4083     return true;
4084 
4085   Register Dst = MI.getOperand(0).getReg();
4086   Register Src = MI.getOperand(2).getReg();
4087   auto Flags = MI.getFlags();
4088 
4089   LLT Ty = MRI.getType(Dst);
4090 
4091   const fltSemantics *FltSemantics;
4092   if (Ty == LLT::scalar(32))
4093     FltSemantics = &APFloat::IEEEsingle();
4094   else if (Ty == LLT::scalar(64))
4095     FltSemantics = &APFloat::IEEEdouble();
4096   else
4097     return false;
4098 
4099   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4100     .addUse(Src)
4101     .setMIFlags(Flags);
4102 
  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq has already quieted (or not) any snan, so use whichever
  // min/max flavor will directly select.
4105   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4106   const bool UseIEEE = MFI->getMode().IEEE;
4107 
4108   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4109   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4110                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4111 
4112   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4113 
4114   if (UseIEEE)
4115     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4116   else
4117     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4118   MI.eraseFromParent();
4119   return true;
4120 }
4121 
4122 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4123   switch (IID) {
4124   case Intrinsic::amdgcn_ds_fadd:
4125     return AMDGPU::G_ATOMICRMW_FADD;
4126   case Intrinsic::amdgcn_ds_fmin:
4127     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4128   case Intrinsic::amdgcn_ds_fmax:
4129     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4130   default:
4131     llvm_unreachable("not a DS FP intrinsic");
4132   }
4133 }
4134 
4135 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4136                                                       MachineInstr &MI,
4137                                                       Intrinsic::ID IID) const {
4138   GISelChangeObserver &Observer = Helper.Observer;
4139   Observer.changingInstr(MI);
4140 
4141   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4142 
4143   // The remaining operands were used to set fields in the MemOperand on
4144   // construction.
4145   for (int I = 6; I > 3; --I)
4146     MI.removeOperand(I);
4147 
4148   MI.removeOperand(1); // Remove the intrinsic ID.
4149   Observer.changedInstr(MI);
4150   return true;
4151 }
4152 
4153 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4154                                             MachineRegisterInfo &MRI,
4155                                             MachineIRBuilder &B) const {
4156   uint64_t Offset =
4157     ST.getTargetLowering()->getImplicitParameterOffset(
4158       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4159   LLT DstTy = MRI.getType(DstReg);
4160   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4161 
4162   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4163   if (!loadInputValue(KernargPtrReg, B,
4164                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4165     return false;
4166 
4167   // FIXME: This should be nuw
4168   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4169   return true;
4170 }
4171 
4172 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4173                                                  MachineRegisterInfo &MRI,
4174                                                  MachineIRBuilder &B) const {
4175   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4176   if (!MFI->isEntryFunction()) {
4177     return legalizePreloadedArgIntrin(MI, MRI, B,
4178                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4179   }
4180 
4181   Register DstReg = MI.getOperand(0).getReg();
4182   if (!getImplicitArgPtr(DstReg, MRI, B))
4183     return false;
4184 
4185   MI.eraseFromParent();
4186   return true;
4187 }
4188 
4189 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4190                                               MachineRegisterInfo &MRI,
4191                                               MachineIRBuilder &B,
4192                                               unsigned AddrSpace) const {
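  // A flat pointer is in the queried segment exactly when the high 32 bits of
  // its address equal that segment's aperture base, which is what the compare
  // below checks.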
4193   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4194   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4195   Register Hi32 = Unmerge.getReg(1);
4196 
4197   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4198   MI.eraseFromParent();
4199   return true;
4200 }
4201 
4202 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4203 // offset (the offset that is included in bounds checking and swizzling, to be
4204 // split between the instruction's voffset and immoffset fields) and soffset
4205 // (the offset that is excluded from bounds checking and swizzling, to go in
4206 // the instruction's soffset field).  This function takes the first kind of
4207 // offset and figures out how to split it between voffset and immoffset.
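// For example (a sketch of the split below, with MaxImm == 4095): an incoming
// constant offset of 8204 becomes voffset = 8192 and immoffset = 12, while an
// offset of 60 stays entirely in immoffset.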
4208 std::pair<Register, unsigned>
4209 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4210                                         Register OrigOffset) const {
4211   const unsigned MaxImm = 4095;
4212   Register BaseReg;
4213   unsigned ImmOffset;
4214   const LLT S32 = LLT::scalar(32);
4215   MachineRegisterInfo &MRI = *B.getMRI();
4216 
4217   std::tie(BaseReg, ImmOffset) =
4218       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4219 
4220   // If BaseReg is a pointer, convert it to int.
4221   if (MRI.getType(BaseReg).isPointer())
4222     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4223 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (the value modulo 4096) there, so that the value that is
  // copied/added for the voffset field is a multiple of 4096 and stands more
  // chance of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
4231   unsigned Overflow = ImmOffset & ~MaxImm;
4232   ImmOffset -= Overflow;
4233   if ((int32_t)Overflow < 0) {
4234     Overflow += ImmOffset;
4235     ImmOffset = 0;
4236   }
4237 
4238   if (Overflow != 0) {
4239     if (!BaseReg) {
4240       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4241     } else {
4242       auto OverflowVal = B.buildConstant(S32, Overflow);
4243       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4244     }
4245   }
4246 
4247   if (!BaseReg)
4248     BaseReg = B.buildConstant(S32, 0).getReg(0);
4249 
4250   return std::make_pair(BaseReg, ImmOffset);
4251 }
4252 
4253 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4254 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4255                                           Register VOffset, Register SOffset,
4256                                           unsigned ImmOffset, Register VIndex,
4257                                           MachineRegisterInfo &MRI) const {
4258   Optional<ValueAndVReg> MaybeVOffsetVal =
4259       getIConstantVRegValWithLookThrough(VOffset, MRI);
4260   Optional<ValueAndVReg> MaybeSOffsetVal =
4261       getIConstantVRegValWithLookThrough(SOffset, MRI);
4262   Optional<ValueAndVReg> MaybeVIndexVal =
4263       getIConstantVRegValWithLookThrough(VIndex, MRI);
4264   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4265   // update the MMO with that offset. The stride is unknown so we can only do
4266   // this if VIndex is constant 0.
4267   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4268       MaybeVIndexVal->Value == 0) {
4269     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4270                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4271     MMO->setOffset(TotalOffset);
4272   } else {
4273     // We don't have a constant combined offset to use in the MMO. Give up.
4274     MMO->setValue((Value *)nullptr);
4275   }
4276 }
4277 
4278 /// Handle register layout difference for f16 images for some subtargets.
4279 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4280                                              MachineRegisterInfo &MRI,
4281                                              Register Reg,
4282                                              bool ImageStore) const {
4283   const LLT S16 = LLT::scalar(16);
4284   const LLT S32 = LLT::scalar(32);
4285   LLT StoreVT = MRI.getType(Reg);
4286   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4287 
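  // On subtargets with the unpacked D16 layout, each 16-bit element occupies
  // its own 32-bit register: any-extend every s16 element to s32 and rebuild
  // the value as an <N x s32> vector.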
4288   if (ST.hasUnpackedD16VMem()) {
4289     auto Unmerge = B.buildUnmerge(S16, Reg);
4290 
4291     SmallVector<Register, 4> WideRegs;
4292     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4293       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4294 
4295     int NumElts = StoreVT.getNumElements();
4296 
4297     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4298         .getReg(0);
4299   }
4300 
4301   if (ImageStore && ST.hasImageStoreD16Bug()) {
4302     if (StoreVT.getNumElements() == 2) {
4303       SmallVector<Register, 4> PackedRegs;
4304       Reg = B.buildBitcast(S32, Reg).getReg(0);
4305       PackedRegs.push_back(Reg);
4306       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4307       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4308           .getReg(0);
4309     }
4310 
4311     if (StoreVT.getNumElements() == 3) {
4312       SmallVector<Register, 4> PackedRegs;
4313       auto Unmerge = B.buildUnmerge(S16, Reg);
4314       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4315         PackedRegs.push_back(Unmerge.getReg(I));
4316       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4317       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4318       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4319     }
4320 
4321     if (StoreVT.getNumElements() == 4) {
4322       SmallVector<Register, 4> PackedRegs;
4323       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4324       auto Unmerge = B.buildUnmerge(S32, Reg);
4325       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4326         PackedRegs.push_back(Unmerge.getReg(I));
4327       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4328       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4329           .getReg(0);
4330     }
4331 
4332     llvm_unreachable("invalid data type");
4333   }
4334 
4335   if (StoreVT == LLT::fixed_vector(3, S16)) {
4336     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4337               .getReg(0);
4338   }
4339   return Reg;
4340 }
4341 
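/// Widen narrow (s8/s16) buffer store sources to s32, and repack small s16
/// vectors for format stores, so the source register type matches what the
/// buffer store pseudos expect.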
4342 Register AMDGPULegalizerInfo::fixStoreSourceType(
4343   MachineIRBuilder &B, Register VData, bool IsFormat) const {
4344   MachineRegisterInfo *MRI = B.getMRI();
4345   LLT Ty = MRI->getType(VData);
4346 
4347   const LLT S16 = LLT::scalar(16);
4348 
  // Fixup illegal register types for 8-bit and 16-bit stores.
4350   if (Ty == LLT::scalar(8) || Ty == S16) {
4351     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4352     return AnyExt;
4353   }
4354 
4355   if (Ty.isVector()) {
4356     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4357       if (IsFormat)
4358         return handleD16VData(B, *MRI, VData);
4359     }
4360   }
4361 
4362   return VData;
4363 }
4364 
4365 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4366                                               MachineRegisterInfo &MRI,
4367                                               MachineIRBuilder &B,
4368                                               bool IsTyped,
4369                                               bool IsFormat) const {
4370   Register VData = MI.getOperand(1).getReg();
4371   LLT Ty = MRI.getType(VData);
4372   LLT EltTy = Ty.getScalarType();
4373   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4374   const LLT S32 = LLT::scalar(32);
4375 
4376   VData = fixStoreSourceType(B, VData, IsFormat);
4377   Register RSrc = MI.getOperand(2).getReg();
4378 
4379   MachineMemOperand *MMO = *MI.memoperands_begin();
4380   const int MemSize = MMO->getSize();
4381 
4382   unsigned ImmOffset;
4383 
4384   // The typed intrinsics add an immediate after the registers.
4385   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4386 
4387   // The struct intrinsic variants add one additional operand over raw.
4388   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4389   Register VIndex;
4390   int OpOffset = 0;
4391   if (HasVIndex) {
4392     VIndex = MI.getOperand(3).getReg();
4393     OpOffset = 1;
4394   } else {
4395     VIndex = B.buildConstant(S32, 0).getReg(0);
4396   }
4397 
4398   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4399   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4400 
4401   unsigned Format = 0;
4402   if (IsTyped) {
4403     Format = MI.getOperand(5 + OpOffset).getImm();
4404     ++OpOffset;
4405   }
4406 
4407   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4408 
4409   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4410   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4411 
4412   unsigned Opc;
4413   if (IsTyped) {
4414     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4415                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4416   } else if (IsFormat) {
4417     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4418                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4419   } else {
4420     switch (MemSize) {
4421     case 1:
4422       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4423       break;
4424     case 2:
4425       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4426       break;
4427     default:
4428       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4429       break;
4430     }
4431   }
4432 
4433   auto MIB = B.buildInstr(Opc)
4434     .addUse(VData)              // vdata
4435     .addUse(RSrc)               // rsrc
4436     .addUse(VIndex)             // vindex
4437     .addUse(VOffset)            // voffset
4438     .addUse(SOffset)            // soffset
4439     .addImm(ImmOffset);         // offset(imm)
4440 
4441   if (IsTyped)
4442     MIB.addImm(Format);
4443 
4444   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4445      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4446      .addMemOperand(MMO);
4447 
4448   MI.eraseFromParent();
4449   return true;
4450 }
4451 
4452 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4453                                              MachineRegisterInfo &MRI,
4454                                              MachineIRBuilder &B,
4455                                              bool IsFormat,
4456                                              bool IsTyped) const {
4457   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4458   MachineMemOperand *MMO = *MI.memoperands_begin();
4459   const LLT MemTy = MMO->getMemoryType();
4460   const LLT S32 = LLT::scalar(32);
4461 
4462   Register Dst = MI.getOperand(0).getReg();
4463   Register RSrc = MI.getOperand(2).getReg();
4464 
4465   // The typed intrinsics add an immediate after the registers.
4466   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4467 
4468   // The struct intrinsic variants add one additional operand over raw.
4469   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4470   Register VIndex;
4471   int OpOffset = 0;
4472   if (HasVIndex) {
4473     VIndex = MI.getOperand(3).getReg();
4474     OpOffset = 1;
4475   } else {
4476     VIndex = B.buildConstant(S32, 0).getReg(0);
4477   }
4478 
4479   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4480   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4481 
4482   unsigned Format = 0;
4483   if (IsTyped) {
4484     Format = MI.getOperand(5 + OpOffset).getImm();
4485     ++OpOffset;
4486   }
4487 
4488   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4489   unsigned ImmOffset;
4490 
4491   LLT Ty = MRI.getType(Dst);
4492   LLT EltTy = Ty.getScalarType();
4493   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4494   const bool Unpacked = ST.hasUnpackedD16VMem();
4495 
4496   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4497   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4498 
4499   unsigned Opc;
4500 
4501   if (IsTyped) {
4502     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4503                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4504   } else if (IsFormat) {
4505     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4506                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4507   } else {
4508     switch (MemTy.getSizeInBits()) {
4509     case 8:
4510       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4511       break;
4512     case 16:
4513       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4514       break;
4515     default:
4516       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4517       break;
4518     }
4519   }
4520 
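  // Sub-dword loads (and scalar d16 loads) are widened to an s32 result and
  // truncated after the load; unpacked d16 vector results are loaded as
  // 32-bit elements and repacked below.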
4521   Register LoadDstReg;
4522 
4523   bool IsExtLoad =
4524       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4525   LLT UnpackedTy = Ty.changeElementSize(32);
4526 
4527   if (IsExtLoad)
4528     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4529   else if (Unpacked && IsD16 && Ty.isVector())
4530     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4531   else
4532     LoadDstReg = Dst;
4533 
4534   auto MIB = B.buildInstr(Opc)
4535     .addDef(LoadDstReg)         // vdata
4536     .addUse(RSrc)               // rsrc
4537     .addUse(VIndex)             // vindex
4538     .addUse(VOffset)            // voffset
4539     .addUse(SOffset)            // soffset
4540     .addImm(ImmOffset);         // offset(imm)
4541 
4542   if (IsTyped)
4543     MIB.addImm(Format);
4544 
4545   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4546      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4547      .addMemOperand(MMO);
4548 
4549   if (LoadDstReg != Dst) {
4550     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4551 
    // The result for extending loads was widened to s32; truncate it back
    // down to the original type.
4553     if (IsExtLoad)
4554       B.buildTrunc(Dst, LoadDstReg);
4555     else {
4556       // Repack to original 16-bit vector result
4557       // FIXME: G_TRUNC should work, but legalization currently fails
4558       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4559       SmallVector<Register, 4> Repack;
4560       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4561         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4562       B.buildMerge(Dst, Repack);
4563     }
4564   }
4565 
4566   MI.eraseFromParent();
4567   return true;
4568 }
4569 
4570 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4571                                                MachineIRBuilder &B,
4572                                                bool IsInc) const {
4573   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4574                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4575   B.buildInstr(Opc)
4576     .addDef(MI.getOperand(0).getReg())
4577     .addUse(MI.getOperand(2).getReg())
4578     .addUse(MI.getOperand(3).getReg())
4579     .cloneMemRefs(MI);
4580   MI.eraseFromParent();
4581   return true;
4582 }
4583 
4584 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4585   switch (IntrID) {
4586   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4587   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4588     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4589   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4590   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4591     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4592   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4593   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4594     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4595   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4596   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4597     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4598   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4599   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4600     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4601   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4602   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4603     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4604   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4605   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4606     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4607   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4608   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4609     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4610   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4611   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4612     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4613   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4614   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4615     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4616   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4617   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4618     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4619   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4620   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4621     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4622   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4623   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4624     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4625   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4626   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4627     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4628   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4629   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4630     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4631   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4632   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4633     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4634   default:
4635     llvm_unreachable("unhandled atomic opcode");
4636   }
4637 }
4638 
4639 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4640                                                MachineIRBuilder &B,
4641                                                Intrinsic::ID IID) const {
4642   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4643                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4644   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4645 
4646   Register Dst;
4647 
4648   int OpOffset = 0;
4649   if (HasReturn) {
4650     // A few FP atomics do not support return values.
4651     Dst = MI.getOperand(0).getReg();
4652   } else {
4653     OpOffset = -1;
4654   }
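  // Operand indices below assume the intrinsic returns a value; OpOffset
  // shifts them down by one for the no-return FP atomic variants.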
4655 
4656   Register VData = MI.getOperand(2 + OpOffset).getReg();
4657   Register CmpVal;
4658 
4659   if (IsCmpSwap) {
4660     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4661     ++OpOffset;
4662   }
4663 
4664   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4665   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4666 
4667   // The struct intrinsic variants add one additional operand over raw.
4668   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4669   Register VIndex;
4670   if (HasVIndex) {
4671     VIndex = MI.getOperand(4 + OpOffset).getReg();
4672     ++OpOffset;
4673   } else {
4674     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4675   }
4676 
4677   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4678   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4679   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4680 
4681   MachineMemOperand *MMO = *MI.memoperands_begin();
4682 
4683   unsigned ImmOffset;
4684   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4685   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4686 
4687   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4688 
4689   if (HasReturn)
4690     MIB.addDef(Dst);
4691 
4692   MIB.addUse(VData); // vdata
4693 
4694   if (IsCmpSwap)
4695     MIB.addReg(CmpVal);
4696 
4697   MIB.addUse(RSrc)               // rsrc
4698      .addUse(VIndex)             // vindex
4699      .addUse(VOffset)            // voffset
4700      .addUse(SOffset)            // soffset
4701      .addImm(ImmOffset)          // offset(imm)
4702      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4703      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4704      .addMemOperand(MMO);
4705 
4706   MI.eraseFromParent();
4707   return true;
4708 }
4709 
/// Pack the 16-bit address operands of \p MI into dword-sized <2 x s16>
/// registers, appending the results to \p PackedAddrs.
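/// For example, with A16 enabled a 2D sample's (u, v) coordinates end up
/// packed together in a single <2 x s16> register, while operands that stay
/// 32-bit are simply bitcast to <2 x s16>.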
4712 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4713                                       SmallVectorImpl<Register> &PackedAddrs,
4714                                       unsigned ArgOffset,
4715                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4716                                       bool IsA16, bool IsG16) {
4717   const LLT S16 = LLT::scalar(16);
4718   const LLT V2S16 = LLT::fixed_vector(2, 16);
4719   auto EndIdx = Intr->VAddrEnd;
4720 
4721   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4722     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4723     if (!SrcOp.isReg())
4724       continue; // _L to _LZ may have eliminated this.
4725 
4726     Register AddrReg = SrcOp.getReg();
4727 
4728     if ((I < Intr->GradientStart) ||
4729         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4730         (I >= Intr->CoordStart && !IsA16)) {
4731       if ((I < Intr->GradientStart) && IsA16 &&
4732           (B.getMRI()->getType(AddrReg) == S16)) {
4733         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit dword.
4736         PackedAddrs.push_back(
4737             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4738                 .getReg(0));
4739       } else {
4740         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4741                "Bias needs to be converted to 16 bit in A16 mode");
4742         // Handle any gradient or coordinate operands that should not be packed
4743         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4744         PackedAddrs.push_back(AddrReg);
4745       }
4746     } else {
4747       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4748       // derivatives dx/dh and dx/dv are packed with undef.
4749       if (((I + 1) >= EndIdx) ||
4750           ((Intr->NumGradients / 2) % 2 == 1 &&
4751            (I == static_cast<unsigned>(Intr->GradientStart +
4752                                        (Intr->NumGradients / 2) - 1) ||
4753             I == static_cast<unsigned>(Intr->GradientStart +
4754                                        Intr->NumGradients - 1))) ||
4755           // Check for _L to _LZ optimization
4756           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4757         PackedAddrs.push_back(
4758             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4759                 .getReg(0));
4760       } else {
4761         PackedAddrs.push_back(
4762             B.buildBuildVector(
4763                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4764                 .getReg(0));
4765         ++I;
4766       }
4767     }
4768   }
4769 }
4770 
4771 /// Convert from separate vaddr components to a single vector address register,
4772 /// and replace the remaining operands with $noreg.
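/// For example, three s32 address operands become a single <3 x s32>
/// build_vector placed in the first vaddr slot, with the two remaining
/// operands cleared to $noreg.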
4773 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4774                                      int DimIdx, int NumVAddrs) {
4775   const LLT S32 = LLT::scalar(32);
4776 
4777   SmallVector<Register, 8> AddrRegs;
4778   for (int I = 0; I != NumVAddrs; ++I) {
4779     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4780     if (SrcOp.isReg()) {
4781       AddrRegs.push_back(SrcOp.getReg());
4782       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4783     }
4784   }
4785 
4786   int NumAddrRegs = AddrRegs.size();
4787   if (NumAddrRegs != 1) {
4788     // Above 8 elements round up to next power of 2 (i.e. 16).
4789     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4790       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4791       auto Undef = B.buildUndef(S32);
4792       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4793       NumAddrRegs = RoundedNumRegs;
4794     }
4795 
4796     auto VAddr =
4797         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4798     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4799   }
4800 
4801   for (int I = 1; I != NumVAddrs; ++I) {
4802     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4803     if (SrcOp.isReg())
4804       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4805   }
4806 }
4807 
4808 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4809 ///
4810 /// Depending on the subtarget, load/store with 16-bit element data need to be
4811 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4812 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4813 /// registers.
4814 ///
4815 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
4817 /// want a selected instruction entering RegBankSelect. In order to avoid
4818 /// defining a multitude of intermediate image instructions, directly hack on
4819 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4820 /// padding now unnecessary arguments with $noreg.
4821 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4822     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4823     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4824 
4825   const unsigned NumDefs = MI.getNumExplicitDefs();
4826   const unsigned ArgOffset = NumDefs + 1;
4827   bool IsTFE = NumDefs == 2;
4828   // We are only processing the operands of d16 image operations on subtargets
4829   // that use the unpacked register layout, or need to repack the TFE result.
4830 
4831   // TODO: Do we need to guard against already legalized intrinsics?
4832   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4833       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4834 
4835   MachineRegisterInfo *MRI = B.getMRI();
4836   const LLT S32 = LLT::scalar(32);
4837   const LLT S16 = LLT::scalar(16);
4838   const LLT V2S16 = LLT::fixed_vector(2, 16);
4839 
4840   unsigned DMask = 0;
4841   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4842   LLT Ty = MRI->getType(VData);
4843 
4844   // Check for 16 bit addresses and pack if true.
4845   LLT GradTy =
4846       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4847   LLT AddrTy =
4848       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4849   const bool IsG16 = GradTy == S16;
4850   const bool IsA16 = AddrTy == S16;
4851   const bool IsD16 = Ty.getScalarType() == S16;
4852 
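  // DMaskLanes counts the enabled result components, e.g. a dmask of 0b1011
  // enables three lanes; gather4 operations always return four.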
4853   int DMaskLanes = 0;
4854   if (!BaseOpcode->Atomic) {
4855     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4856     if (BaseOpcode->Gather4) {
4857       DMaskLanes = 4;
4858     } else if (DMask != 0) {
4859       DMaskLanes = countPopulation(DMask);
4860     } else if (!IsTFE && !BaseOpcode->Store) {
4861       // If dmask is 0, this is a no-op load. This can be eliminated.
4862       B.buildUndef(MI.getOperand(0));
4863       MI.eraseFromParent();
4864       return true;
4865     }
4866   }
4867 
4868   Observer.changingInstr(MI);
4869   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4870 
4871   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4872                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4873   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4874                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4875   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4876 
4877   // Track that we legalized this
4878   MI.setDesc(B.getTII().get(NewOpcode));
4879 
  // Expecting to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4882   if (IsTFE && DMask == 0) {
4883     DMask = 0x1;
4884     DMaskLanes = 1;
4885     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4886   }
4887 
4888   if (BaseOpcode->Atomic) {
4889     Register VData0 = MI.getOperand(2).getReg();
4890     LLT Ty = MRI->getType(VData0);
4891 
4892     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4893     if (Ty.isVector())
4894       return false;
4895 
4896     if (BaseOpcode->AtomicX2) {
4897       Register VData1 = MI.getOperand(3).getReg();
4898       // The two values are packed in one register.
4899       LLT PackedTy = LLT::fixed_vector(2, Ty);
4900       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4901       MI.getOperand(2).setReg(Concat.getReg(0));
4902       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4903     }
4904   }
4905 
4906   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4907 
4908   // Rewrite the addressing register layout before doing anything else.
4909   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4910     // 16 bit gradients are supported, but are tied to the A16 control
4911     // so both gradients and addresses must be 16 bit
4912     return false;
4913   }
4914 
4915   if (IsA16 && !ST.hasA16()) {
4916     // A16 not supported
4917     return false;
4918   }
4919 
4920   if (IsA16 || IsG16) {
4921     if (Intr->NumVAddrs > 1) {
4922       SmallVector<Register, 4> PackedRegs;
4923 
4924       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4925                                 IsG16);
4926 
4927       // See also below in the non-a16 branch
4928       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4929                           PackedRegs.size() <= ST.getNSAMaxSize();
4930 
4931       if (!UseNSA && PackedRegs.size() > 1) {
4932         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4933         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4934         PackedRegs[0] = Concat.getReg(0);
4935         PackedRegs.resize(1);
4936       }
4937 
4938       const unsigned NumPacked = PackedRegs.size();
4939       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4940         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4941         if (!SrcOp.isReg()) {
4942           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4943           continue;
4944         }
4945 
4946         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4947 
4948         if (I - Intr->VAddrStart < NumPacked)
4949           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4950         else
4951           SrcOp.setReg(AMDGPU::NoRegister);
4952       }
4953     }
4954   } else {
4955     // If the register allocator cannot place the address registers contiguously
4956     // without introducing moves, then using the non-sequential address encoding
4957     // is always preferable, since it saves VALU instructions and is usually a
4958     // wash in terms of code size or even better.
4959     //
4960     // However, we currently have no way of hinting to the register allocator
4961     // that MIMG addresses should be placed contiguously when it is possible to
4962     // do so, so force non-NSA for the common 2-address case as a heuristic.
4963     //
4964     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4965     // allocation when possible.
4966     //
4967     // TODO: we can actually allow partial NSA where the final register is a
4968     // contiguous set of the remaining addresses.
4969     // This could help where there are more addresses than supported.
4970     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4971                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4972 
4973     if (!UseNSA && Intr->NumVAddrs > 1)
4974       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4975                                Intr->NumVAddrs);
4976   }
4977 
4978   int Flags = 0;
4979   if (IsA16)
4980     Flags |= 1;
4981   if (IsG16)
4982     Flags |= 2;
4983   MI.addOperand(MachineOperand::CreateImm(Flags));
4984 
4985   if (BaseOpcode->Store) { // No TFE for stores?
4986     // TODO: Handle dmask trim
4987     if (!Ty.isVector() || !IsD16)
4988       return true;
4989 
4990     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4991     if (RepackedReg != VData) {
4992       MI.getOperand(1).setReg(RepackedReg);
4993     }
4994 
4995     return true;
4996   }
4997 
4998   Register DstReg = MI.getOperand(0).getReg();
4999   const LLT EltTy = Ty.getScalarType();
5000   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5001 
5002   // Confirm that the return type is large enough for the dmask specified
5003   if (NumElts < DMaskLanes)
5004     return false;
5005 
5006   if (NumElts > 4 || DMaskLanes > 4)
5007     return false;
5008 
5009   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5010   const LLT AdjustedTy =
5011       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5012 
5013   // The raw dword aligned data component of the load. The only legal cases
5014   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
5016   LLT RoundedTy;
5017 
  // S32 vector to cover all data, plus the TFE result element.
5019   LLT TFETy;
5020 
5021   // Register type to use for each loaded component. Will be S32 or V2S16.
5022   LLT RegTy;
5023 
5024   if (IsD16 && ST.hasUnpackedD16VMem()) {
5025     RoundedTy =
5026         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5027     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5028     RegTy = S32;
5029   } else {
5030     unsigned EltSize = EltTy.getSizeInBits();
5031     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5032     unsigned RoundedSize = 32 * RoundedElts;
5033     RoundedTy = LLT::scalarOrVector(
5034         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5035     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5036     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5037   }
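  // For example, a packed-subtarget d16 <3 x s16> load rounds RoundedTy up to
  // <4 x s16>, uses a <3 x s32> TFETy (two data dwords plus the error dword),
  // and splits into <2 x s16> pieces (or s32 pieces when TFE is enabled).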
5038 
5039   // The return type does not need adjustment.
5040   // TODO: Should we change s16 case to s32 or <2 x s16>?
5041   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5042     return true;
5043 
5044   Register Dst1Reg;
5045 
5046   // Insert after the instruction.
5047   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5048 
5049   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5050   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5051   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5052   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5053 
5054   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5055 
5056   MI.getOperand(0).setReg(NewResultReg);
5057 
5058   // In the IR, TFE is supposed to be used with a 2 element struct return
5059   // type. The instruction really returns these two values in one contiguous
5060   // register, with one additional dword beyond the loaded data. Rewrite the
5061   // return type to use a single register result.
5062 
5063   if (IsTFE) {
5064     Dst1Reg = MI.getOperand(1).getReg();
5065     if (MRI->getType(Dst1Reg) != S32)
5066       return false;
5067 
5068     // TODO: Make sure the TFE operand bit is set.
5069     MI.removeOperand(1);
5070 
5071     // Handle the easy case that requires no repack instructions.
5072     if (Ty == S32) {
5073       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5074       return true;
5075     }
5076   }
5077 
5078   // Now figure out how to copy the new result register back into the old
5079   // result.
5080   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5081 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5083 
5084   if (ResultNumRegs == 1) {
5085     assert(!IsTFE);
5086     ResultRegs[0] = NewResultReg;
5087   } else {
5088     // We have to repack into a new vector of some kind.
5089     for (int I = 0; I != NumDataRegs; ++I)
5090       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5091     B.buildUnmerge(ResultRegs, NewResultReg);
5092 
5093     // Drop the final TFE element to get the data part. The TFE result is
5094     // directly written to the right place already.
5095     if (IsTFE)
5096       ResultRegs.resize(NumDataRegs);
5097   }
5098 
5099   // For an s16 scalar result, we form an s32 result with a truncate regardless
5100   // of packed vs. unpacked.
5101   if (IsD16 && !Ty.isVector()) {
5102     B.buildTrunc(DstReg, ResultRegs[0]);
5103     return true;
5104   }
5105 
5106   // Avoid a build/concat_vector of 1 entry.
5107   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5108     B.buildBitcast(DstReg, ResultRegs[0]);
5109     return true;
5110   }
5111 
5112   assert(Ty.isVector());
5113 
5114   if (IsD16) {
5115     // For packed D16 results with TFE enabled, all the data components are
5116     // S32. Cast back to the expected type.
5117     //
    // TODO: We don't really need to load s32 elements. We would only need one
5119     // cast for the TFE result if a multiple of v2s16 was used.
5120     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5121       for (Register &Reg : ResultRegs)
5122         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5123     } else if (ST.hasUnpackedD16VMem()) {
5124       for (Register &Reg : ResultRegs)
5125         Reg = B.buildTrunc(S16, Reg).getReg(0);
5126     }
5127   }
5128 
5129   auto padWithUndef = [&](LLT Ty, int NumElts) {
5130     if (NumElts == 0)
5131       return;
5132     Register Undef = B.buildUndef(Ty).getReg(0);
5133     for (int I = 0; I != NumElts; ++I)
5134       ResultRegs.push_back(Undef);
5135   };
5136 
5137   // Pad out any elements eliminated due to the dmask.
5138   LLT ResTy = MRI->getType(ResultRegs[0]);
5139   if (!ResTy.isVector()) {
5140     padWithUndef(ResTy, NumElts - ResultRegs.size());
5141     B.buildBuildVector(DstReg, ResultRegs);
5142     return true;
5143   }
5144 
5145   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5146   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5147 
5148   // Deal with the one annoying legal case.
5149   const LLT V3S16 = LLT::fixed_vector(3, 16);
5150   if (Ty == V3S16) {
5151     if (IsTFE) {
5152       if (ResultRegs.size() == 1) {
5153         NewResultReg = ResultRegs[0];
5154       } else if (ResultRegs.size() == 2) {
5155         LLT V4S16 = LLT::fixed_vector(4, 16);
5156         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5157       } else {
5158         return false;
5159       }
5160     }
5161 
5162     if (MRI->getType(DstReg).getNumElements() <
5163         MRI->getType(NewResultReg).getNumElements()) {
5164       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5165     } else {
5166       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5167     }
5168     return true;
5169   }
5170 
5171   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5172   B.buildConcatVectors(DstReg, ResultRegs);
5173   return true;
5174 }
5175 
5176 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5177   LegalizerHelper &Helper, MachineInstr &MI) const {
5178   MachineIRBuilder &B = Helper.MIRBuilder;
5179   GISelChangeObserver &Observer = Helper.Observer;
5180 
5181   Register Dst = MI.getOperand(0).getReg();
5182   LLT Ty = B.getMRI()->getType(Dst);
5183   unsigned Size = Ty.getSizeInBits();
5184   MachineFunction &MF = B.getMF();
5185 
5186   Observer.changingInstr(MI);
5187 
5188   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5189     Ty = getBitcastRegisterType(Ty);
5190     Helper.bitcastDst(MI, Ty, 0);
5191     Dst = MI.getOperand(0).getReg();
5192     B.setInsertPt(B.getMBB(), MI);
5193   }
5194 
5195   // FIXME: We don't really need this intermediate instruction. The intrinsic
5196   // should be fixed to have a memory operand. Since it's readnone, we're not
5197   // allowed to add one.
5198   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5199   MI.removeOperand(1); // Remove intrinsic ID
5200 
5201   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5202   // TODO: Should this use datalayout alignment?
5203   const unsigned MemSize = (Size + 7) / 8;
5204   const Align MemAlign(4);
5205   MachineMemOperand *MMO = MF.getMachineMemOperand(
5206       MachinePointerInfo(),
5207       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5208           MachineMemOperand::MOInvariant,
5209       MemSize, MemAlign);
5210   MI.addMemOperand(MF, MMO);
5211 
5212   // There are no 96-bit result scalar loads, but widening to 128-bit should
5213   // always be legal. We may need to restore this to a 96-bit result if it turns
5214   // out this needs to be converted to a vector load during RegBankSelect.
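  // For example, a 96-bit <3 x s32> result is widened here to <4 x s32>.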
5215   if (!isPowerOf2_32(Size)) {
5216     if (Ty.isVector())
5217       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5218     else
5219       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5220   }
5221 
5222   Observer.changedInstr(MI);
5223   return true;
5224 }
5225 
5226 // TODO: Move to selection
5227 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5228                                                 MachineRegisterInfo &MRI,
5229                                                 MachineIRBuilder &B) const {
5230   if (!ST.isTrapHandlerEnabled() ||
5231       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5232     return legalizeTrapEndpgm(MI, MRI, B);
5233 
5234   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5235     switch (*HsaAbiVer) {
5236     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5237     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5238       return legalizeTrapHsaQueuePtr(MI, MRI, B);
5239     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5240     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5241       return ST.supportsGetDoorbellID() ?
5242           legalizeTrapHsa(MI, MRI, B) :
5243           legalizeTrapHsaQueuePtr(MI, MRI, B);
5244     }
5245   }
5246 
5247   llvm_unreachable("Unknown trap handler");
5248 }
5249 
5250 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5251     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5252   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5253   MI.eraseFromParent();
5254   return true;
5255 }
5256 
5257 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5258     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5259   MachineFunction &MF = B.getMF();
5260   const LLT S64 = LLT::scalar(64);
5261 
5262   Register SGPR01(AMDGPU::SGPR0_SGPR1);
5263   // For code object version 5, queue_ptr is passed through implicit kernarg.
5264   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5265     AMDGPUTargetLowering::ImplicitParameter Param =
5266         AMDGPUTargetLowering::QUEUE_PTR;
5267     uint64_t Offset =
5268         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5269 
5270     Register KernargPtrReg = MRI.createGenericVirtualRegister(
5271         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5272 
5273     if (!loadInputValue(KernargPtrReg, B,
5274                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5275       return false;
5276 
5277     // TODO: can we be smarter about machine pointer info?
5278     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5279     MachineMemOperand *MMO = MF.getMachineMemOperand(
5280         PtrInfo,
5281         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5282             MachineMemOperand::MOInvariant,
5283         LLT::scalar(64), commonAlignment(Align(64), Offset));
5284 
5285     // Pointer address
5286     Register LoadAddr = MRI.createGenericVirtualRegister(
5287         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5288     B.buildPtrAdd(LoadAddr, KernargPtrReg,
5289                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5290     // Load address
5291     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5292     B.buildCopy(SGPR01, Temp);
5293     B.buildInstr(AMDGPU::S_TRAP)
5294         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5295         .addReg(SGPR01, RegState::Implicit);
5296     MI.eraseFromParent();
5297     return true;
5298   }
5299 
5300   // Pass queue pointer to trap handler as input, and insert trap instruction
5301   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5302   Register LiveIn =
5303     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5304   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5305     return false;
5306 
5307   B.buildCopy(SGPR01, LiveIn);
5308   B.buildInstr(AMDGPU::S_TRAP)
5309       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5310       .addReg(SGPR01, RegState::Implicit);
5311 
5312   MI.eraseFromParent();
5313   return true;
5314 }
5315 
5316 bool AMDGPULegalizerInfo::legalizeTrapHsa(
5317     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5318   B.buildInstr(AMDGPU::S_TRAP)
5319       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5320   MI.eraseFromParent();
5321   return true;
5322 }
5323 
5324 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5325     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
5328   if (!ST.isTrapHandlerEnabled() ||
5329       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5330     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5331                                      "debugtrap handler not supported",
5332                                      MI.getDebugLoc(), DS_Warning);
5333     LLVMContext &Ctx = B.getMF().getFunction().getContext();
5334     Ctx.diagnose(NoTrap);
5335   } else {
5336     // Insert debug-trap instruction
5337     B.buildInstr(AMDGPU::S_TRAP)
5338         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5339   }
5340 
5341   MI.eraseFromParent();
5342   return true;
5343 }
5344 
5345 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5346                                                MachineIRBuilder &B) const {
5347   MachineRegisterInfo &MRI = *B.getMRI();
5348   const LLT S16 = LLT::scalar(16);
5349   const LLT S32 = LLT::scalar(32);
5350   const LLT V2S16 = LLT::fixed_vector(2, 16);
5351   const LLT V3S32 = LLT::fixed_vector(3, 32);
5352 
5353   Register DstReg = MI.getOperand(0).getReg();
5354   Register NodePtr = MI.getOperand(2).getReg();
5355   Register RayExtent = MI.getOperand(3).getReg();
5356   Register RayOrigin = MI.getOperand(4).getReg();
5357   Register RayDir = MI.getOperand(5).getReg();
5358   Register RayInvDir = MI.getOperand(6).getReg();
5359   Register TDescr = MI.getOperand(7).getReg();
5360 
5361   if (!ST.hasGFX10_AEncoding()) {
5362     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5363                                         "intrinsic not supported on subtarget",
5364                                         MI.getDebugLoc());
5365     B.getMF().getFunction().getContext().diagnose(BadIntrin);
5366     return false;
5367   }
5368 
5369   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5370   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5371   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5372   const unsigned NumVDataDwords = 4;
5373   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
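  // The dword counts above are 2 (64-bit node_ptr) or 1, plus 1 (ray_extent),
  // plus 3 each for origin, dir and inv_dir; with A16 the dir and inv_dir
  // halves pack into 3 dwords total.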
5374   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5375   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5376   const unsigned BaseOpcodes[2][2] = {
5377       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5378       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5379        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5380   int Opcode;
5381   if (UseNSA) {
5382     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5383                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5384                                                : AMDGPU::MIMGEncGfx10NSA,
5385                                    NumVDataDwords, NumVAddrDwords);
5386   } else {
5387     Opcode = AMDGPU::getMIMGOpcode(
5388         BaseOpcodes[Is64][IsA16],
5389         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5390         NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
5391   }
5392   assert(Opcode != -1);
5393 
5394   SmallVector<Register, 12> Ops;
5395   if (UseNSA && IsGFX11Plus) {
5396     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5397       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5398       auto Merged = B.buildMerge(
5399           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5400       Ops.push_back(Merged.getReg(0));
5401     };
5402 
5403     Ops.push_back(NodePtr);
5404     Ops.push_back(RayExtent);
5405     packLanes(RayOrigin);
5406 
5407     if (IsA16) {
5408       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5409       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5410       auto MergedDir = B.buildMerge(
5411           V3S32,
5412           {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
5413                                                     UnmergeRayDir.getReg(0)}))
5414                .getReg(0),
5415            B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
5416                                                     UnmergeRayDir.getReg(1)}))
5417                .getReg(0),
5418            B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
5419                                                     UnmergeRayDir.getReg(2)}))
5420                .getReg(0)});
5421       Ops.push_back(MergedDir.getReg(0));
5422     } else {
5423       packLanes(RayDir);
5424       packLanes(RayInvDir);
5425     }
5426   } else {
5427     if (Is64) {
5428       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5429       Ops.push_back(Unmerge.getReg(0));
5430       Ops.push_back(Unmerge.getReg(1));
5431     } else {
5432       Ops.push_back(NodePtr);
5433     }
5434     Ops.push_back(RayExtent);
5435 
5436     auto packLanes = [&Ops, &S32, &B](Register Src) {
5437       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5438       Ops.push_back(Unmerge.getReg(0));
5439       Ops.push_back(Unmerge.getReg(1));
5440       Ops.push_back(Unmerge.getReg(2));
5441     };
5442 
5443     packLanes(RayOrigin);
5444     if (IsA16) {
5445       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5446       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5447       Register R1 = MRI.createGenericVirtualRegister(S32);
5448       Register R2 = MRI.createGenericVirtualRegister(S32);
5449       Register R3 = MRI.createGenericVirtualRegister(S32);
5450       B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5451       B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5452       B.buildMerge(R3,
5453                    {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5454       Ops.push_back(R1);
5455       Ops.push_back(R2);
5456       Ops.push_back(R3);
5457     } else {
5458       packLanes(RayDir);
5459       packLanes(RayInvDir);
5460     }
5461   }
5462 
5463   if (!UseNSA) {
5464     // Build a single vector containing all the operands so far prepared.
5465     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5466     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5467     Ops.clear();
5468     Ops.push_back(MergedOps);
5469   }
5470 
5471   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5472     .addDef(DstReg)
5473     .addImm(Opcode);
5474 
5475   for (Register R : Ops) {
5476     MIB.addUse(R);
5477   }
5478 
5479   MIB.addUse(TDescr)
5480      .addImm(IsA16 ? 1 : 0)
5481      .cloneMemRefs(MI);
5482 
5483   MI.eraseFromParent();
5484   return true;
5485 }
5486 
5487 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5488                                                MachineIRBuilder &B) const {
5489   unsigned Opc;
5490   int RoundMode = MI.getOperand(2).getImm();
5491 
5492   if (RoundMode == (int)RoundingMode::TowardPositive)
5493     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5494   else if (RoundMode == (int)RoundingMode::TowardNegative)
5495     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5496   else
5497     return false;
5498 
5499   B.buildInstr(Opc)
5500       .addDef(MI.getOperand(0).getReg())
5501       .addUse(MI.getOperand(1).getReg());
5502 
5503   MI.eraseFromParent();
5504 
5505   return true;
5506 }
5507 
5508 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5509                                             MachineInstr &MI) const {
5510   MachineIRBuilder &B = Helper.MIRBuilder;
5511   MachineRegisterInfo &MRI = *B.getMRI();
5512 
5513   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
5514   auto IntrID = MI.getIntrinsicID();
5515   switch (IntrID) {
5516   case Intrinsic::amdgcn_if:
5517   case Intrinsic::amdgcn_else: {
5518     MachineInstr *Br = nullptr;
5519     MachineBasicBlock *UncondBrTarget = nullptr;
5520     bool Negated = false;
5521     if (MachineInstr *BrCond =
5522             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5523       const SIRegisterInfo *TRI
5524         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5525 
5526       Register Def = MI.getOperand(1).getReg();
5527       Register Use = MI.getOperand(3).getReg();
5528 
5529       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5530 
5531       if (Negated)
5532         std::swap(CondBrTarget, UncondBrTarget);
5533 
5534       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5535       if (IntrID == Intrinsic::amdgcn_if) {
5536         B.buildInstr(AMDGPU::SI_IF)
5537           .addDef(Def)
5538           .addUse(Use)
5539           .addMBB(UncondBrTarget);
5540       } else {
5541         B.buildInstr(AMDGPU::SI_ELSE)
5542             .addDef(Def)
5543             .addUse(Use)
5544             .addMBB(UncondBrTarget);
5545       }
5546 
5547       if (Br) {
5548         Br->getOperand(0).setMBB(CondBrTarget);
5549       } else {
5550         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5551         // since we're swapping branch targets it needs to be reinserted.
5552         // FIXME: IRTranslator should probably not do this
5553         B.buildBr(*CondBrTarget);
5554       }
5555 
5556       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5557       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5558       MI.eraseFromParent();
5559       BrCond->eraseFromParent();
5560       return true;
5561     }
5562 
5563     return false;
5564   }
5565   case Intrinsic::amdgcn_loop: {
5566     MachineInstr *Br = nullptr;
5567     MachineBasicBlock *UncondBrTarget = nullptr;
5568     bool Negated = false;
5569     if (MachineInstr *BrCond =
5570             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5571       const SIRegisterInfo *TRI
5572         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5573 
5574       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5575       Register Reg = MI.getOperand(2).getReg();
5576 
5577       if (Negated)
5578         std::swap(CondBrTarget, UncondBrTarget);
5579 
5580       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5581       B.buildInstr(AMDGPU::SI_LOOP)
5582         .addUse(Reg)
5583         .addMBB(UncondBrTarget);
5584 
5585       if (Br)
5586         Br->getOperand(0).setMBB(CondBrTarget);
5587       else
5588         B.buildBr(*CondBrTarget);
5589 
5590       MI.eraseFromParent();
5591       BrCond->eraseFromParent();
5592       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5593       return true;
5594     }
5595 
5596     return false;
5597   }
5598   case Intrinsic::amdgcn_kernarg_segment_ptr:
5599     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5600       // This only makes sense to call in a kernel, so just lower to null.
5601       B.buildConstant(MI.getOperand(0).getReg(), 0);
5602       MI.eraseFromParent();
5603       return true;
5604     }
5605 
5606     return legalizePreloadedArgIntrin(
5607       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5608   case Intrinsic::amdgcn_implicitarg_ptr:
5609     return legalizeImplicitArgPtr(MI, MRI, B);
5610   case Intrinsic::amdgcn_workitem_id_x:
5611     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5612                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5613   case Intrinsic::amdgcn_workitem_id_y:
5614     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5615                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5616   case Intrinsic::amdgcn_workitem_id_z:
5617     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5618                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5619   case Intrinsic::amdgcn_workgroup_id_x:
5620     return legalizePreloadedArgIntrin(MI, MRI, B,
5621                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5622   case Intrinsic::amdgcn_workgroup_id_y:
5623     return legalizePreloadedArgIntrin(MI, MRI, B,
5624                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5625   case Intrinsic::amdgcn_workgroup_id_z:
5626     return legalizePreloadedArgIntrin(MI, MRI, B,
5627                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5628   case Intrinsic::amdgcn_dispatch_ptr:
5629     return legalizePreloadedArgIntrin(MI, MRI, B,
5630                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5631   case Intrinsic::amdgcn_queue_ptr:
5632     return legalizePreloadedArgIntrin(MI, MRI, B,
5633                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5634   case Intrinsic::amdgcn_implicit_buffer_ptr:
5635     return legalizePreloadedArgIntrin(
5636       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5637   case Intrinsic::amdgcn_dispatch_id:
5638     return legalizePreloadedArgIntrin(MI, MRI, B,
5639                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5640   case Intrinsic::r600_read_ngroups_x:
5641     // TODO: Emit error for hsa
5642     return legalizeKernargMemParameter(MI, B,
5643                                        SI::KernelInputOffsets::NGROUPS_X);
5644   case Intrinsic::r600_read_ngroups_y:
5645     return legalizeKernargMemParameter(MI, B,
5646                                        SI::KernelInputOffsets::NGROUPS_Y);
5647   case Intrinsic::r600_read_ngroups_z:
5648     return legalizeKernargMemParameter(MI, B,
5649                                        SI::KernelInputOffsets::NGROUPS_Z);
5650   case Intrinsic::r600_read_local_size_x:
5651     // TODO: Could insert G_ASSERT_ZEXT from s16
5652     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5659   case Intrinsic::r600_read_global_size_x:
5660     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5661   case Intrinsic::r600_read_global_size_y:
5662     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5663   case Intrinsic::r600_read_global_size_z:
5664     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
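  // amdgcn.is.shared / amdgcn.is.private test whether a flat pointer aliases
  // LDS or scratch; this is lowered to a compare of the pointer's high half
  // against the corresponding memory aperture.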
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
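  // The wavefront size is a fixed subtarget property (32 or 64), so the
  // intrinsic simply folds to a constant.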
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
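  // Raw and struct buffer accesses share one legalization path per direction;
  // the boolean flags distinguish the format and typed (tbuffer) variants.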
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/true,
                               /*IsFormat=*/true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/false,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/true);
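  // All buffer atomic intrinsics funnel into legalizeBufferAtomic, which maps
  // the intrinsic ID to the matching target G_AMDGPU_BUFFER_ATOMIC_* opcode.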
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
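    // The variants that return the old value are only selectable when the
    // gfx90a instructions are available; otherwise diagnose any use of the
    // result and fold it to undef so legalization can proceed.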
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
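  // amdgcn.atomic.inc / amdgcn.atomic.dec wrap around their second operand
  // rather than the full type width, so they are not plain G_ATOMICRMW
  // add/sub and get a dedicated lowering.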
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
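  // Everything else is either an image intrinsic, recognized through the
  // generated ImageDimIntrinsicInfo table, or needs no legalization here.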
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}