1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
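// e.g. <3 x s32> -> <4 x s32>, <5 x s16> -> <8 x s16>.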
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55 
// Round the scalar size in bits up to the next power of two.
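// e.g. s48 -> s64, s20 -> s32.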
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx,
100                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
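// Compute the piece type used to split a wide vector into chunks of at most
// 64 bits, e.g. <6 x s16> -> <3 x s16>, <4 x s32> -> <2 x s32>.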
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::make_pair(
112         TypeIdx,
113         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114   };
115 }
116 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
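// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits),
//      <5 x s16> (80 bits) -> <6 x s16> (96 bits).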
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120   return [=](const LegalityQuery &Query) {
121     const LLT Ty = Query.Types[TypeIdx];
122 
123     const LLT EltTy = Ty.getElementType();
124     const int Size = Ty.getSizeInBits();
125     const int EltSize = EltTy.getSizeInBits();
126     const int NextMul32 = (Size + 31) / 32;
127 
128     assert(EltSize < 32);
129 
130     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132   };
133 }
134 
135 static LLT getBitcastRegisterType(const LLT Ty) {
136   const unsigned Size = Ty.getSizeInBits();
137 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
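  // Larger (register-sized) types become vectors of 32-bit elements:
  // <4 x s16> -> <2 x s32>
  // <8 x s8>  -> <2 x s32>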
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
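// e.g. an extending load from s16 memory into an s64 result, or the matching
// truncating store.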
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
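/// e.g. an <8 x s8> access is then handled as a <2 x s32> access.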
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access
/// itself changes size, not the result register.
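/// e.g. an s24 load with 32-bit alignment may be widened to a 32-bit load.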
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
843       .customFor({S16, S32})
844       .scalarize(0)
845       .lower();
846 
847   // Lower roundeven into G_FRINT
848   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
849     .scalarize(0)
850     .lower();
851 
852   if (ST.has16BitInsts()) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S16, S32, S64})
855       .clampScalar(0, S16, S64)
856       .scalarize(0);
857   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32, S64})
860       .clampScalar(0, S32, S64)
861       .scalarize(0);
862   } else {
863     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
864       .legalFor({S32})
865       .customFor({S64})
866       .clampScalar(0, S32, S64)
867       .scalarize(0);
868   }
869 
870   getActionDefinitionsBuilder(G_PTR_ADD)
871     .legalIf(all(isPointer(0), sameSize(0, 1)))
872     .scalarize(0)
873     .scalarSameSizeAs(1, 0);
874 
875   getActionDefinitionsBuilder(G_PTRMASK)
876     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
877     .scalarSameSizeAs(1, 0)
878     .scalarize(0);
879 
880   auto &CmpBuilder =
881     getActionDefinitionsBuilder(G_ICMP)
882     // The compare output type differs based on the register bank of the output,
883     // so make both s1 and s32 legal.
884     //
885     // Scalar compares producing output in scc will be promoted to s32, as that
886     // is the allocatable register type that will be needed for the copy from
887     // scc. This will be promoted during RegBankSelect, and we assume something
888     // before that won't try to use s32 result types.
889     //
890     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
891     // bank.
892     .legalForCartesianProduct(
893       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
894     .legalForCartesianProduct(
895       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
896   if (ST.has16BitInsts()) {
897     CmpBuilder.legalFor({{S1, S16}});
898   }
899 
900   CmpBuilder
901     .widenScalarToNextPow2(1)
902     .clampScalar(1, S32, S64)
903     .scalarize(0)
904     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
905 
906   getActionDefinitionsBuilder(G_FCMP)
907     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
908     .widenScalarToNextPow2(1)
909     .clampScalar(1, S32, S64)
910     .scalarize(0);
911 
912   // FIXME: fpow has a selection pattern that should move to custom lowering.
913   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
914   if (ST.has16BitInsts())
915     Exp2Ops.legalFor({S32, S16});
916   else
917     Exp2Ops.legalFor({S32});
918   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
919   Exp2Ops.scalarize(0);
920 
921   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
922   if (ST.has16BitInsts())
923     ExpOps.customFor({{S32}, {S16}});
924   else
925     ExpOps.customFor({S32});
926   ExpOps.clampScalar(0, MinScalarFPTy, S32)
927         .scalarize(0);
928 
929   getActionDefinitionsBuilder(G_FPOWI)
930     .clampScalar(0, MinScalarFPTy, S32)
931     .lower();
932 
933   // The 64-bit versions produce 32-bit results, but only on the SALU.
934   getActionDefinitionsBuilder(G_CTPOP)
935     .legalFor({{S32, S32}, {S32, S64}})
936     .clampScalar(0, S32, S32)
937     .widenScalarToNextPow2(1, 32)
938     .clampScalar(1, S32, S64)
939     .scalarize(0)
940     .widenScalarToNextPow2(0, 32);
941 
942 
943   // The hardware instructions return a different result on 0 than the generic
944   // instructions expect. The hardware produces -1, but these produce the
945   // bitwidth.
946   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
947     .scalarize(0)
948     .clampScalar(0, S32, S32)
949     .clampScalar(1, S32, S64)
950     .widenScalarToNextPow2(0, 32)
951     .widenScalarToNextPow2(1, 32)
952     .custom();
953 
954   // The 64-bit versions produce 32-bit results, but only on the SALU.
955   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
956     .legalFor({{S32, S32}, {S32, S64}})
957     .clampScalar(0, S32, S32)
958     .clampScalar(1, S32, S64)
959     .scalarize(0)
960     .widenScalarToNextPow2(0, 32)
961     .widenScalarToNextPow2(1, 32);
962 
963   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
964   // RegBankSelect.
965   getActionDefinitionsBuilder(G_BITREVERSE)
966     .legalFor({S32, S64})
967     .clampScalar(0, S32, S64)
968     .scalarize(0)
969     .widenScalarToNextPow2(0);
970 
971   if (ST.has16BitInsts()) {
972     getActionDefinitionsBuilder(G_BSWAP)
973       .legalFor({S16, S32, V2S16})
974       .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
976       // narrowScalar limitation.
977       .widenScalarToNextPow2(0)
978       .clampScalar(0, S16, S32)
979       .scalarize(0);
980 
981     if (ST.hasVOP3PInsts()) {
982       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983         .legalFor({S32, S16, V2S16})
984         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
985         .clampMaxNumElements(0, S16, 2)
986         .minScalar(0, S16)
987         .widenScalarToNextPow2(0)
988         .scalarize(0)
989         .lower();
990     } else {
991       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
992         .legalFor({S32, S16})
993         .widenScalarToNextPow2(0)
994         .minScalar(0, S16)
995         .scalarize(0)
996         .lower();
997     }
998   } else {
999     // TODO: Should have same legality without v_perm_b32
1000     getActionDefinitionsBuilder(G_BSWAP)
1001       .legalFor({S32})
1002       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1004       // narrowScalar limitation.
1005       .widenScalarToNextPow2(0)
1006       .maxScalar(0, S32)
1007       .scalarize(0)
1008       .lower();
1009 
1010     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1011       .legalFor({S32})
1012       .minScalar(0, S32)
1013       .widenScalarToNextPow2(0)
1014       .scalarize(0)
1015       .lower();
1016   }
1017 
1018   getActionDefinitionsBuilder(G_INTTOPTR)
1019     // List the common cases
1020     .legalForCartesianProduct(AddrSpaces64, {S64})
1021     .legalForCartesianProduct(AddrSpaces32, {S32})
1022     .scalarize(0)
1023     // Accept any address space as long as the size matches
1024     .legalIf(sameSize(0, 1))
1025     .widenScalarIf(smallerThan(1, 0),
1026       [](const LegalityQuery &Query) {
1027         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1028       })
1029     .narrowScalarIf(largerThan(1, 0),
1030       [](const LegalityQuery &Query) {
1031         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1032       });
1033 
1034   getActionDefinitionsBuilder(G_PTRTOINT)
1035     // List the common cases
1036     .legalForCartesianProduct(AddrSpaces64, {S64})
1037     .legalForCartesianProduct(AddrSpaces32, {S32})
1038     .scalarize(0)
1039     // Accept any address space as long as the size matches
1040     .legalIf(sameSize(0, 1))
1041     .widenScalarIf(smallerThan(0, 1),
1042       [](const LegalityQuery &Query) {
1043         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1044       })
1045     .narrowScalarIf(
1046       largerThan(0, 1),
1047       [](const LegalityQuery &Query) {
1048         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1049       });
1050 
1051   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1052     .scalarize(0)
1053     .custom();
1054 
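  // Whether a memory access must be split because no single instruction can
  // perform it, e.g. a vector extload, a 96-bit access on subtargets without
  // dwordx3 load/store support, or an access wider than the address space
  // allows.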
1055   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1056                                     bool IsLoad) -> bool {
1057     const LLT DstTy = Query.Types[0];
1058 
1059     // Split vector extloads.
1060     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1061 
1062     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1063       return true;
1064 
1065     const LLT PtrTy = Query.Types[1];
1066     unsigned AS = PtrTy.getAddressSpace();
1067     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1068       return true;
1069 
1070     // Catch weird sized loads that don't evenly divide into the access sizes
1071     // TODO: May be able to widen depending on alignment etc.
1072     unsigned NumRegs = (MemSize + 31) / 32;
1073     if (NumRegs == 3) {
1074       if (!ST.hasDwordx3LoadStores())
1075         return true;
1076     } else {
1077       // If the alignment allows, these should have been widened.
1078       if (!isPowerOf2_32(NumRegs))
1079         return true;
1080     }
1081 
1082     return false;
1083   };
1084 
1085   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1086   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1087   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1088 
1089   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1090   // LDS
1091   // TODO: Unsupported flat for SI.
1092 
1093   for (unsigned Op : {G_LOAD, G_STORE}) {
1094     const bool IsStore = Op == G_STORE;
1095 
1096     auto &Actions = getActionDefinitionsBuilder(Op);
1097     // Explicitly list some common cases.
1098     // TODO: Does this help compile time at all?
1099     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1100                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1101                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1102                                       {S64, GlobalPtr, S64, GlobalAlign32},
1103                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1104                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1105                                       {S32, GlobalPtr, S8, GlobalAlign8},
1106                                       {S32, GlobalPtr, S16, GlobalAlign16},
1107 
1108                                       {S32, LocalPtr, S32, 32},
1109                                       {S64, LocalPtr, S64, 32},
1110                                       {V2S32, LocalPtr, V2S32, 32},
1111                                       {S32, LocalPtr, S8, 8},
1112                                       {S32, LocalPtr, S16, 16},
1113                                       {V2S16, LocalPtr, S32, 32},
1114 
1115                                       {S32, PrivatePtr, S32, 32},
1116                                       {S32, PrivatePtr, S8, 8},
1117                                       {S32, PrivatePtr, S16, 16},
1118                                       {V2S16, PrivatePtr, S32, 32},
1119 
1120                                       {S32, ConstantPtr, S32, GlobalAlign32},
1121                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1122                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1123                                       {S64, ConstantPtr, S64, GlobalAlign32},
1124                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1125     Actions.legalIf(
1126       [=](const LegalityQuery &Query) -> bool {
1127         return isLoadStoreLegal(ST, Query);
1128       });
1129 
1130     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1131     // 64-bits.
1132     //
1133     // TODO: Should generalize bitcast action into coerce, which will also cover
1134     // inserting addrspacecasts.
1135     Actions.customIf(typeIs(1, Constant32Ptr));
1136 
1137     // Turn any illegal element vectors into something easier to deal
1138     // with. These will ultimately produce 32-bit scalar shifts to extract the
1139     // parts anyway.
1140     //
1141     // For odd 16-bit element vectors, prefer to split those into pieces with
1142     // 16-bit vector parts.
1143     Actions.bitcastIf(
1144       [=](const LegalityQuery &Query) -> bool {
1145         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1146                                           Query.MMODescrs[0].MemoryTy);
1147       }, bitcastToRegisterType(0));
1148 
1149     if (!IsStore) {
1150       // Widen suitably aligned loads by loading extra bytes. The standard
1151       // legalization actions can't properly express widening memory operands.
1152       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1153         return shouldWidenLoad(ST, Query, G_LOAD);
1154       });
1155     }
1156 
1157     // FIXME: load/store narrowing should be moved to lower action
1158     Actions
1159         .narrowScalarIf(
1160             [=](const LegalityQuery &Query) -> bool {
1161               return !Query.Types[0].isVector() &&
1162                      needToSplitMemOp(Query, Op == G_LOAD);
1163             },
1164             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1165               const LLT DstTy = Query.Types[0];
1166               const LLT PtrTy = Query.Types[1];
1167 
1168               const unsigned DstSize = DstTy.getSizeInBits();
1169               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1170 
1171               // Split extloads.
1172               if (DstSize > MemSize)
1173                 return std::make_pair(0, LLT::scalar(MemSize));
1174 
1175               unsigned MaxSize = maxSizeForAddrSpace(ST,
1176                                                      PtrTy.getAddressSpace(),
1177                                                      Op == G_LOAD);
1178               if (MemSize > MaxSize)
1179                 return std::make_pair(0, LLT::scalar(MaxSize));
1180 
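              // Otherwise narrow to an alignment-sized scalar; the pieces
              // will be legalized further if needed.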
1181               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1182               return std::make_pair(0, LLT::scalar(Align));
1183             })
1184         .fewerElementsIf(
1185             [=](const LegalityQuery &Query) -> bool {
1186               return Query.Types[0].isVector() &&
1187                      needToSplitMemOp(Query, Op == G_LOAD);
1188             },
1189             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190               const LLT DstTy = Query.Types[0];
1191               const LLT PtrTy = Query.Types[1];
1192 
1193               LLT EltTy = DstTy.getElementType();
1194               unsigned MaxSize = maxSizeForAddrSpace(ST,
1195                                                      PtrTy.getAddressSpace(),
1196                                                      Op == G_LOAD);
1197 
1198               // FIXME: Handle widened to power of 2 results better. This ends
1199               // up scalarizing.
1200               // FIXME: 3 element stores scalarized on SI
1201 
1202               // Split if it's too large for the address space.
1203               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1204               if (MemSize > MaxSize) {
1205                 unsigned NumElts = DstTy.getNumElements();
1206                 unsigned EltSize = EltTy.getSizeInBits();
1207 
1208                 if (MaxSize % EltSize == 0) {
1209                   return std::make_pair(
1210                       0, LLT::scalarOrVector(
1211                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1212                 }
1213 
1214                 unsigned NumPieces = MemSize / MaxSize;
1215 
                // FIXME: Refine when odd breakdowns are handled.
1217                 // The scalars will need to be re-legalized.
1218                 if (NumPieces == 1 || NumPieces >= NumElts ||
1219                     NumElts % NumPieces != 0)
1220                   return std::make_pair(0, EltTy);
1221 
1222                 return std::make_pair(
1223                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1224               }
1225 
1226               // FIXME: We could probably handle weird extending loads better.
1227               if (DstTy.getSizeInBits() > MemSize)
1228                 return std::make_pair(0, EltTy);
1229 
1230               unsigned EltSize = EltTy.getSizeInBits();
1231               unsigned DstSize = DstTy.getSizeInBits();
1232               if (!isPowerOf2_32(DstSize)) {
1233                 // We're probably decomposing an odd sized store. Try to split
1234                 // to the widest type. TODO: Account for alignment. As-is it
1235                 // should be OK, since the new parts will be further legalized.
1236                 unsigned FloorSize = PowerOf2Floor(DstSize);
1237                 return std::make_pair(
1238                     0, LLT::scalarOrVector(
1239                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1240               }
1241 
1242               // May need relegalization for the scalars.
1243               return std::make_pair(0, EltTy);
1244             })
1245     .minScalar(0, S32)
1246     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1247     .widenScalarToNextPow2(0)
1248     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1249     .lower();
1250   }
1251 
1252   // FIXME: Unaligned accesses not lowered.
1253   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1254                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1255                                                   {S32, GlobalPtr, S16, 2 * 8},
1256                                                   {S32, LocalPtr, S8, 8},
1257                                                   {S32, LocalPtr, S16, 16},
1258                                                   {S32, PrivatePtr, S8, 8},
1259                                                   {S32, PrivatePtr, S16, 16},
1260                                                   {S32, ConstantPtr, S8, 8},
1261                                                   {S32, ConstantPtr, S16, 2 * 8}})
1262                        .legalIf(
1263                          [=](const LegalityQuery &Query) -> bool {
1264                            return isLoadStoreLegal(ST, Query);
1265                          });
1266 
1267   if (ST.hasFlatAddressSpace()) {
1268     ExtLoads.legalForTypesWithMemDesc(
1269         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1270   }
1271 
1272   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1273   // 64-bits.
1274   //
1275   // TODO: Should generalize bitcast action into coerce, which will also cover
1276   // inserting addrspacecasts.
1277   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1278 
1279   ExtLoads.clampScalar(0, S32, S32)
1280           .widenScalarToNextPow2(0)
1281           .lower();
1282 
1283   auto &Atomics = getActionDefinitionsBuilder(
1284     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287      G_ATOMICRMW_UMIN})
1288     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289                {S64, GlobalPtr}, {S64, LocalPtr},
1290                {S32, RegionPtr}, {S64, RegionPtr}});
1291   if (ST.hasFlatAddressSpace()) {
1292     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293   }
1294 
1295   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296   if (ST.hasLDSFPAtomicAdd()) {
1297     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298     if (ST.hasGFX90AInsts())
1299       Atomic.legalFor({{S64, LocalPtr}});
1300     if (ST.hasGFX940Insts())
1301       Atomic.legalFor({{V2S16, LocalPtr}});
1302   }
1303   if (ST.hasAtomicFaddInsts())
1304     Atomic.legalFor({{S32, GlobalPtr}});
1305 
1306   if (ST.hasGFX90AInsts()) {
1307     // These are legal with some caveats, and should have undergone expansion in
1308     // the IR in most situations
1309     // TODO: Move atomic expansion into legalizer
1310     // TODO: Also supports <2 x f16>
1311     Atomic.legalFor({
1312         {S32, GlobalPtr},
1313         {S64, GlobalPtr},
1314         {S64, FlatPtr}
1315       });
1316   }
1317 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1320   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1321     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1322                 {S32, FlatPtr}, {S64, FlatPtr}})
1323     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1324                {S32, RegionPtr}, {S64, RegionPtr}});
1325   // TODO: Pointer types, any 32-bit or 64-bit vector
1326 
1327   // Condition should be s32 for scalar, s1 for vector.
1328   getActionDefinitionsBuilder(G_SELECT)
1329       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1330                                  LocalPtr, FlatPtr, PrivatePtr,
1331                                  LLT::fixed_vector(2, LocalPtr),
1332                                  LLT::fixed_vector(2, PrivatePtr)},
1333                                 {S1, S32})
1334       .clampScalar(0, S16, S64)
1335       .scalarize(1)
1336       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1337       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1338       .clampMaxNumElements(0, S32, 2)
1339       .clampMaxNumElements(0, LocalPtr, 2)
1340       .clampMaxNumElements(0, PrivatePtr, 2)
1341       .scalarize(0)
1342       .widenScalarToNextPow2(0)
1343       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1344 
1345   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1346   // be more flexible with the shift amount type.
1347   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1348     .legalFor({{S32, S32}, {S64, S32}});
1349   if (ST.has16BitInsts()) {
1350     if (ST.hasVOP3PInsts()) {
1351       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1352             .clampMaxNumElements(0, S16, 2);
1353     } else
1354       Shifts.legalFor({{S16, S16}});
1355 
1356     // TODO: Support 16-bit shift amounts for all types
1357     Shifts.widenScalarIf(
1358       [=](const LegalityQuery &Query) {
1359         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1360         // 32-bit amount.
1361         const LLT ValTy = Query.Types[0];
1362         const LLT AmountTy = Query.Types[1];
1363         return ValTy.getSizeInBits() <= 16 &&
1364                AmountTy.getSizeInBits() < 16;
1365       }, changeTo(1, S16));
1366     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1367     Shifts.clampScalar(1, S32, S32);
1368     Shifts.widenScalarToNextPow2(0, 16);
1369     Shifts.clampScalar(0, S16, S64);
1370 
1371     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1372       .minScalar(0, S16)
1373       .scalarize(0)
1374       .lower();
1375   } else {
1376     // Make sure we legalize the shift amount type first, as the general
1377     // expansion for the shifted type will produce much worse code if it hasn't
1378     // been truncated already.
1379     Shifts.clampScalar(1, S32, S32);
1380     Shifts.widenScalarToNextPow2(0, 32);
1381     Shifts.clampScalar(0, S32, S64);
1382 
1383     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384       .minScalar(0, S32)
1385       .scalarize(0)
1386       .lower();
1387   }
1388   Shifts.scalarize(0);
1389 
1390   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1391     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1392     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1393     unsigned IdxTypeIdx = 2;
1394 
1395     getActionDefinitionsBuilder(Op)
1396       .customIf([=](const LegalityQuery &Query) {
1397           const LLT EltTy = Query.Types[EltTypeIdx];
1398           const LLT VecTy = Query.Types[VecTypeIdx];
1399           const LLT IdxTy = Query.Types[IdxTypeIdx];
1400           const unsigned EltSize = EltTy.getSizeInBits();
1401           return (EltSize == 32 || EltSize == 64) &&
1402                   VecTy.getSizeInBits() % 32 == 0 &&
1403                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1404                   IdxTy.getSizeInBits() == 32;
1405         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1407                  bitcastToVectorElement32(VecTypeIdx))
1408       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1409       .bitcastIf(
1410         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1411         [=](const LegalityQuery &Query) {
1412           // For > 64-bit element types, try to turn this into a 64-bit
1413           // element vector since we may be able to do better indexing
1414           // if this is scalar. If not, fall back to 32.
1415           const LLT EltTy = Query.Types[EltTypeIdx];
1416           const LLT VecTy = Query.Types[VecTypeIdx];
1417           const unsigned DstEltSize = EltTy.getSizeInBits();
1418           const unsigned VecSize = VecTy.getSizeInBits();
1419 
1420           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1421           return std::make_pair(
1422               VecTypeIdx,
1423               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1424         })
1425       .clampScalar(EltTypeIdx, S32, S64)
1426       .clampScalar(VecTypeIdx, S32, S64)
1427       .clampScalar(IdxTypeIdx, S32, S32)
1428       .clampMaxNumElements(VecTypeIdx, S32, 32)
1429       // TODO: Clamp elements for 64-bit vectors?
1430       // It should only be necessary with variable indexes.
1431       // As a last resort, lower to the stack
1432       .lower();
1433   }
1434 
1435   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436     .unsupportedIf([=](const LegalityQuery &Query) {
1437         const LLT &EltTy = Query.Types[1].getElementType();
1438         return Query.Types[0] != EltTy;
1439       });
1440 
1441   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444 
1445     // FIXME: Doesn't handle extract of illegal sizes.
1446     getActionDefinitionsBuilder(Op)
1447       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single-element) insert and extract.
1450           // TODO: verify immediate offset here since lower only works with
1451           // whole elements.
1452           const LLT BigTy = Query.Types[BigTyIdx];
1453           return BigTy.isVector();
1454         })
1455       // FIXME: Multiples of 16 should not be legal.
1456       .legalIf([=](const LegalityQuery &Query) {
1457           const LLT BigTy = Query.Types[BigTyIdx];
1458           const LLT LitTy = Query.Types[LitTyIdx];
1459           return (BigTy.getSizeInBits() % 32 == 0) &&
1460                  (LitTy.getSizeInBits() % 16 == 0);
1461         })
1462       .widenScalarIf(
1463         [=](const LegalityQuery &Query) {
1464           const LLT BigTy = Query.Types[BigTyIdx];
1465           return (BigTy.getScalarSizeInBits() < 16);
1466         },
1467         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1468       .widenScalarIf(
1469         [=](const LegalityQuery &Query) {
1470           const LLT LitTy = Query.Types[LitTyIdx];
1471           return (LitTy.getScalarSizeInBits() < 16);
1472         },
1473         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1474       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1475       .widenScalarToNextPow2(BigTyIdx, 32);
1476 
1477   }
1478 
1479   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1480     .legalForCartesianProduct(AllS32Vectors, {S32})
1481     .legalForCartesianProduct(AllS64Vectors, {S64})
1482     .clampNumElements(0, V16S32, V32S32)
1483     .clampNumElements(0, V2S64, V16S64)
1484     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1485 
1486   if (ST.hasScalarPackInsts()) {
1487     BuildVector
1488       // FIXME: Should probably widen s1 vectors straight to s32
1489       .minScalarOrElt(0, S16)
1490       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1491       .minScalar(1, S32);
1492 
1493     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1494       .legalFor({V2S16, S32})
1495       .lower();
1496     BuildVector.minScalarOrElt(0, S32);
1497   } else {
1498     BuildVector.customFor({V2S16, S16});
1499     BuildVector.minScalarOrElt(0, S32);
1500 
1501     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1502       .customFor({V2S16, S32})
1503       .lower();
1504   }
1505 
1506   BuildVector.legalIf(isRegisterType(0));
1507 
1508   // FIXME: Clamp maximum size
1509   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1510     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1511     .clampMaxNumElements(0, S32, 32)
1512     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1513     .clampMaxNumElements(0, S16, 64);
1514 
1515   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1516   // pre-legalize.
1517   if (ST.hasVOP3PInsts()) {
1518     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1519       .customFor({V2S16, V2S16})
1520       .lower();
1521   } else
1522     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1523 
1524   // Merge/Unmerge
1525   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1526     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1527     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1528 
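    // Element types narrower than 8 bits, wider than 512 bits, or with a
    // non-power-of-2 size are not handled directly; such vectors are
    // scalarized by the fewerElementsIf rules below.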
1529     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1530       const LLT Ty = Query.Types[TypeIdx];
1531       if (Ty.isVector()) {
1532         const LLT &EltTy = Ty.getElementType();
1533         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1534           return true;
1535         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1536           return true;
1537       }
1538       return false;
1539     };
1540 
1541     auto &Builder = getActionDefinitionsBuilder(Op)
1542       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1543       .lowerFor({{S16, V2S16}})
1544       .lowerIf([=](const LegalityQuery &Query) {
1545           const LLT BigTy = Query.Types[BigTyIdx];
1546           return BigTy.getSizeInBits() == 32;
1547         })
1548       // Try to widen to s16 first for small types.
1549       // TODO: Only do this on targets with legal s16 shifts
1550       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1551       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1552       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1553       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1554                            elementTypeIs(1, S16)),
1555                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1559       .clampScalar(LitTyIdx, S32, S512)
1560       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1561       // Break up vectors with weird elements into scalars
1562       .fewerElementsIf(
1563         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1564         scalarize(0))
1565       .fewerElementsIf(
1566         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1567         scalarize(1))
1568       .clampScalar(BigTyIdx, S32, MaxScalar);
1569 
1570     if (Op == G_MERGE_VALUES) {
1571       Builder.widenScalarIf(
1572         // TODO: Use 16-bit shifts if legal for 8-bit values?
1573         [=](const LegalityQuery &Query) {
1574           const LLT Ty = Query.Types[LitTyIdx];
1575           return Ty.getSizeInBits() < 32;
1576         },
1577         changeTo(LitTyIdx, S32));
1578     }
1579 
1580     Builder.widenScalarIf(
1581       [=](const LegalityQuery &Query) {
1582         const LLT Ty = Query.Types[BigTyIdx];
1583         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1584           Ty.getSizeInBits() % 16 != 0;
1585       },
1586       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 once the size is over
        // 128 bits, whichever is smaller.
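        // e.g. s88 widens to s128, while s312 rounds up to s320 rather than
        // s512.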
1589         const LLT &Ty = Query.Types[BigTyIdx];
1590         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1591         if (NewSizeInBits >= 256) {
1592           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1593           if (RoundedTo < NewSizeInBits)
1594             NewSizeInBits = RoundedTo;
1595         }
1596         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1597       })
1598       // Any vectors left are the wrong size. Scalarize them.
1599       .scalarize(0)
1600       .scalarize(1);
1601   }
1602 
1603   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1604   // RegBankSelect.
1605   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1606     .legalFor({{S32}, {S64}});
1607 
1608   if (ST.hasVOP3PInsts()) {
1609     SextInReg.lowerFor({{V2S16}})
1610       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1611       // get more vector shift opportunities, since we'll get those when
1612       // expanded.
1613       .clampMaxNumElementsStrict(0, S16, 2);
1614   } else if (ST.has16BitInsts()) {
1615     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1616   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1619     SextInReg.lowerFor({{S32}, {S64}});
1620   }
1621 
1622   SextInReg
1623     .scalarize(0)
1624     .clampScalar(0, S32, S64)
1625     .lower();
1626 
1627   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1628     .scalarize(0)
1629     .lower();
1630 
  // TODO: Only try to form v2s16 with legal packed instructions.
1632   getActionDefinitionsBuilder(G_FSHR)
1633     .legalFor({{S32, S32}})
1634     .lowerFor({{V2S16, V2S16}})
1635     .clampMaxNumElementsStrict(0, S16, 2)
1636     .scalarize(0)
1637     .lower();
1638 
1639   if (ST.hasVOP3PInsts()) {
1640     getActionDefinitionsBuilder(G_FSHL)
1641       .lowerFor({{V2S16, V2S16}})
1642       .clampMaxNumElementsStrict(0, S16, 2)
1643       .scalarize(0)
1644       .lower();
1645   } else {
1646     getActionDefinitionsBuilder(G_FSHL)
1647       .scalarize(0)
1648       .lower();
1649   }
1650 
1651   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652     .legalFor({S64});
1653 
1654   getActionDefinitionsBuilder(G_FENCE)
1655     .alwaysLegal();
1656 
1657   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658       .scalarize(0)
1659       .minScalar(0, S32)
1660       .lower();
1661 
1662   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663       .legalFor({{S32, S32}, {S64, S32}})
1664       .clampScalar(1, S32, S32)
1665       .clampScalar(0, S32, S64)
1666       .widenScalarToNextPow2(0)
1667       .scalarize(0);
1668 
1669   getActionDefinitionsBuilder({
1670       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1671       G_FCOPYSIGN,
1672 
1673       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674       G_ATOMICRMW_NAND,
1675       G_ATOMICRMW_FSUB,
1676       G_READ_REGISTER,
1677       G_WRITE_REGISTER,
1678 
1679       G_SADDO, G_SSUBO,
1680 
      // TODO: Implement
1682       G_FMINIMUM, G_FMAXIMUM}).lower();
1683 
1684   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1685       .lower();
1686 
1687   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1688         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1689         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1690     .unsupported();
1691 
1692   getLegacyLegalizerInfo().computeTables();
1693   verify(*ST.getInstrInfo());
1694 }
1695 
1696 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1697                                          MachineInstr &MI) const {
1698   MachineIRBuilder &B = Helper.MIRBuilder;
1699   MachineRegisterInfo &MRI = *B.getMRI();
1700 
1701   switch (MI.getOpcode()) {
1702   case TargetOpcode::G_ADDRSPACE_CAST:
1703     return legalizeAddrSpaceCast(MI, MRI, B);
1704   case TargetOpcode::G_FRINT:
1705     return legalizeFrint(MI, MRI, B);
1706   case TargetOpcode::G_FCEIL:
1707     return legalizeFceil(MI, MRI, B);
1708   case TargetOpcode::G_FREM:
1709     return legalizeFrem(MI, MRI, B);
1710   case TargetOpcode::G_INTRINSIC_TRUNC:
1711     return legalizeIntrinsicTrunc(MI, MRI, B);
1712   case TargetOpcode::G_SITOFP:
1713     return legalizeITOFP(MI, MRI, B, true);
1714   case TargetOpcode::G_UITOFP:
1715     return legalizeITOFP(MI, MRI, B, false);
1716   case TargetOpcode::G_FPTOSI:
1717     return legalizeFPTOI(MI, MRI, B, true);
1718   case TargetOpcode::G_FPTOUI:
1719     return legalizeFPTOI(MI, MRI, B, false);
1720   case TargetOpcode::G_FMINNUM:
1721   case TargetOpcode::G_FMAXNUM:
1722   case TargetOpcode::G_FMINNUM_IEEE:
1723   case TargetOpcode::G_FMAXNUM_IEEE:
1724     return legalizeMinNumMaxNum(Helper, MI);
1725   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1726     return legalizeExtractVectorElt(MI, MRI, B);
1727   case TargetOpcode::G_INSERT_VECTOR_ELT:
1728     return legalizeInsertVectorElt(MI, MRI, B);
1729   case TargetOpcode::G_SHUFFLE_VECTOR:
1730     return legalizeShuffleVector(MI, MRI, B);
1731   case TargetOpcode::G_FSIN:
1732   case TargetOpcode::G_FCOS:
1733     return legalizeSinCos(MI, MRI, B);
1734   case TargetOpcode::G_GLOBAL_VALUE:
1735     return legalizeGlobalValue(MI, MRI, B);
1736   case TargetOpcode::G_LOAD:
1737   case TargetOpcode::G_SEXTLOAD:
1738   case TargetOpcode::G_ZEXTLOAD:
1739     return legalizeLoad(Helper, MI);
1740   case TargetOpcode::G_FMAD:
1741     return legalizeFMad(MI, MRI, B);
1742   case TargetOpcode::G_FDIV:
1743     return legalizeFDIV(MI, MRI, B);
1744   case TargetOpcode::G_UDIV:
1745   case TargetOpcode::G_UREM:
1746   case TargetOpcode::G_UDIVREM:
1747     return legalizeUnsignedDIV_REM(MI, MRI, B);
1748   case TargetOpcode::G_SDIV:
1749   case TargetOpcode::G_SREM:
1750   case TargetOpcode::G_SDIVREM:
1751     return legalizeSignedDIV_REM(MI, MRI, B);
1752   case TargetOpcode::G_ATOMIC_CMPXCHG:
1753     return legalizeAtomicCmpXChg(MI, MRI, B);
1754   case TargetOpcode::G_FLOG:
1755     return legalizeFlog(MI, B, numbers::ln2f);
1756   case TargetOpcode::G_FLOG10:
1757     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1758   case TargetOpcode::G_FEXP:
1759     return legalizeFExp(MI, B);
1760   case TargetOpcode::G_FPOW:
1761     return legalizeFPow(MI, B);
1762   case TargetOpcode::G_FFLOOR:
1763     return legalizeFFloor(MI, MRI, B);
1764   case TargetOpcode::G_BUILD_VECTOR:
1765     return legalizeBuildVector(MI, MRI, B);
1766   case TargetOpcode::G_CTLZ:
1767   case TargetOpcode::G_CTTZ:
1768     return legalizeCTLZ_CTTZ(MI, MRI, B);
1769   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1770     return legalizeFPTruncRound(MI, B);
1771   default:
1772     return false;
1773   }
1774 
1775   llvm_unreachable("expected switch to return");
1776 }
1777 
1778 Register AMDGPULegalizerInfo::getSegmentAperture(
1779   unsigned AS,
1780   MachineRegisterInfo &MRI,
1781   MachineIRBuilder &B) const {
1782   MachineFunction &MF = B.getMF();
1783   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1784   const LLT S32 = LLT::scalar(32);
1785 
1786   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1787 
1788   if (ST.hasApertureRegs()) {
1789     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1790     // getreg.
1791     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1792         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1793         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1794     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1795         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1796         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1797     unsigned Encoding =
1798         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1799         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1800         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1801 
1802     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1803 
1804     B.buildInstr(AMDGPU::S_GETREG_B32)
1805       .addDef(GetReg)
1806       .addImm(Encoding);
1807     MRI.setType(GetReg, S32);
1808 
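    // The aperture base is packed into a 16-bit hardware register field;
    // shift it up by the field width to form the high 32 bits of the 64-bit
    // segment address.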
1809     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1810     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1811   }
1812 
1813   // TODO: can we be smarter about machine pointer info?
1814   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1815   Register LoadAddr = MRI.createGenericVirtualRegister(
1816     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1817   // For code object version 5, private_base and shared_base are passed through
1818   // implicit kernargs.
1819   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1820     AMDGPUTargetLowering::ImplicitParameter Param =
1821         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1822                                       : AMDGPUTargetLowering::PRIVATE_BASE;
1823     uint64_t Offset =
1824         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1825 
1826     Register KernargPtrReg = MRI.createGenericVirtualRegister(
1827         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1828 
1829     if (!loadInputValue(KernargPtrReg, B,
1830                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1831       return Register();
1832 
1833     MachineMemOperand *MMO = MF.getMachineMemOperand(
1834         PtrInfo,
1835         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1836             MachineMemOperand::MOInvariant,
1837         LLT::scalar(32), commonAlignment(Align(64), Offset));
1838 
1839     // Pointer address
1840     B.buildPtrAdd(LoadAddr, KernargPtrReg,
1841                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1842     // Load address
1843     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1844   }
1845 
1846   Register QueuePtr = MRI.createGenericVirtualRegister(
1847     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1848 
1849   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1850     return Register();
1851 
1852   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1853   // private_segment_aperture_base_hi.
1854   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1855 
1856   MachineMemOperand *MMO = MF.getMachineMemOperand(
1857       PtrInfo,
1858       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1859           MachineMemOperand::MOInvariant,
1860       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1861 
1862   B.buildPtrAdd(LoadAddr, QueuePtr,
1863                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1864   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1865 }
1866 
1867 /// Return true if the value is a known valid address, such that a null check is
1868 /// not necessary.
1869 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1870                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1871   MachineInstr *Def = MRI.getVRegDef(Val);
1872   switch (Def->getOpcode()) {
1873   case AMDGPU::G_FRAME_INDEX:
1874   case AMDGPU::G_GLOBAL_VALUE:
1875   case AMDGPU::G_BLOCK_ADDR:
1876     return true;
1877   case AMDGPU::G_CONSTANT: {
1878     const ConstantInt *CI = Def->getOperand(1).getCImm();
1879     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1880   }
1881   default:
1882     return false;
1883   }
1884 
1885   return false;
1886 }
1887 
1888 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1889   MachineInstr &MI, MachineRegisterInfo &MRI,
1890   MachineIRBuilder &B) const {
1891   MachineFunction &MF = B.getMF();
1892 
1893   const LLT S32 = LLT::scalar(32);
1894   Register Dst = MI.getOperand(0).getReg();
1895   Register Src = MI.getOperand(1).getReg();
1896 
1897   LLT DstTy = MRI.getType(Dst);
1898   LLT SrcTy = MRI.getType(Src);
1899   unsigned DestAS = DstTy.getAddressSpace();
1900   unsigned SrcAS = SrcTy.getAddressSpace();
1901 
1902   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1903   // vector element.
1904   assert(!DstTy.isVector());
1905 
1906   const AMDGPUTargetMachine &TM
1907     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1908 
1909   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1910     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1911     return true;
1912   }
1913 
1914   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1915       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1916        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1917     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1918       // Extract low 32-bits of the pointer.
1919       B.buildExtract(Dst, Src, 0);
1920       MI.eraseFromParent();
1921       return true;
1922     }
1923 
1924     unsigned NullVal = TM.getNullPointerValue(DestAS);
1925 
1926     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1927     auto FlatNull = B.buildConstant(SrcTy, 0);
1928 
1929     // Extract low 32-bits of the pointer.
1930     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1931 
1932     auto CmpRes =
1933         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1934     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1935 
1936     MI.eraseFromParent();
1937     return true;
1938   }
1939 
1940   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1941       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1942        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1943     if (!ST.hasFlatAddressSpace())
1944       return false;
1945 
1946     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1947     if (!ApertureReg.isValid())
1948       return false;
1949 
1950     // Coerce the type of the low half of the result so we can use merge_values.
1951     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1952 
1953     // TODO: Should we allow mismatched types but matching sizes in merges to
1954     // avoid the ptrtoint?
1955     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1956 
1957     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1958       B.buildCopy(Dst, BuildPtr);
1959       MI.eraseFromParent();
1960       return true;
1961     }
1962 
1963     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1964     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1965 
1966     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1967                               SegmentNull.getReg(0));
1968 
1969     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1970 
1971     MI.eraseFromParent();
1972     return true;
1973   }
1974 
1975   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1976       SrcTy.getSizeInBits() == 64) {
1977     // Truncate.
1978     B.buildExtract(Dst, Src, 0);
1979     MI.eraseFromParent();
1980     return true;
1981   }
1982 
1983   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1984       DstTy.getSizeInBits() == 64) {
1985     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1986     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1987 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1991     auto HighAddr = B.buildConstant(
1992         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1993     B.buildMerge(Dst, {Src, HighAddr});
1994     MI.eraseFromParent();
1995     return true;
1996   }
1997 
1998   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1999       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2000 
2001   LLVMContext &Ctx = MF.getFunction().getContext();
2002   Ctx.diagnose(InvalidAddrSpaceCast);
2003   B.buildUndef(Dst);
2004   MI.eraseFromParent();
2005   return true;
2006 }
2007 
2008 bool AMDGPULegalizerInfo::legalizeFrint(
2009   MachineInstr &MI, MachineRegisterInfo &MRI,
2010   MachineIRBuilder &B) const {
2011   Register Src = MI.getOperand(1).getReg();
2012   LLT Ty = MRI.getType(Src);
2013   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2014 
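  // Adding and then subtracting a copysigned 2^52 forces rounding to an
  // integer in the current rounding mode; any |x| > 0x1.fffffffffffffp+51 is
  // already integral and is returned unchanged by the final select.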
2015   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2016   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2017 
2018   auto C1 = B.buildFConstant(Ty, C1Val);
2019   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2020 
2021   // TODO: Should this propagate fast-math-flags?
2022   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2023   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2024 
2025   auto C2 = B.buildFConstant(Ty, C2Val);
2026   auto Fabs = B.buildFAbs(Ty, Src);
2027 
2028   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2029   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2030   MI.eraseFromParent();
2031   return true;
2032 }
2033 
2034 bool AMDGPULegalizerInfo::legalizeFceil(
2035   MachineInstr &MI, MachineRegisterInfo &MRI,
2036   MachineIRBuilder &B) const {
2037 
2038   const LLT S1 = LLT::scalar(1);
2039   const LLT S64 = LLT::scalar(64);
2040 
2041   Register Src = MI.getOperand(1).getReg();
2042   assert(MRI.getType(Src) == S64);
2043 
2044   // result = trunc(src)
2045   // if (src > 0.0 && src != result)
2046   //   result += 1.0
2047 
2048   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2049 
2050   const auto Zero = B.buildFConstant(S64, 0.0);
2051   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
2055   auto Add = B.buildSelect(S64, And, One, Zero);
2056 
2057   // TODO: Should this propagate fast-math-flags?
2058   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2059   MI.eraseFromParent();
2060   return true;
2061 }
2062 
2063 bool AMDGPULegalizerInfo::legalizeFrem(
2064   MachineInstr &MI, MachineRegisterInfo &MRI,
2065   MachineIRBuilder &B) const {
2066     Register DstReg = MI.getOperand(0).getReg();
2067     Register Src0Reg = MI.getOperand(1).getReg();
2068     Register Src1Reg = MI.getOperand(2).getReg();
2069     auto Flags = MI.getFlags();
2070     LLT Ty = MRI.getType(DstReg);
2071 
2072     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2073     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2074     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2075     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2076     MI.eraseFromParent();
2077     return true;
2078 }
2079 
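// Extract the unbiased exponent of an f64 given the high 32 bits of its bit
// pattern: pull the 11-bit exponent field (bits 52..62 of the full value) out
// with ubfe and subtract the bias of 1023.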
2080 static MachineInstrBuilder extractF64Exponent(Register Hi,
2081                                               MachineIRBuilder &B) {
2082   const unsigned FractBits = 52;
2083   const unsigned ExpBits = 11;
2084   LLT S32 = LLT::scalar(32);
2085 
2086   auto Const0 = B.buildConstant(S32, FractBits - 32);
2087   auto Const1 = B.buildConstant(S32, ExpBits);
2088 
2089   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2090     .addUse(Hi)
2091     .addUse(Const0.getReg(0))
2092     .addUse(Const1.getReg(0));
2093 
2094   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2095 }
2096 
2097 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2098   MachineInstr &MI, MachineRegisterInfo &MRI,
2099   MachineIRBuilder &B) const {
2100   const LLT S1 = LLT::scalar(1);
2101   const LLT S32 = LLT::scalar(32);
2102   const LLT S64 = LLT::scalar(64);
2103 
2104   Register Src = MI.getOperand(1).getReg();
2105   assert(MRI.getType(Src) == S64);
2106 
2107   // TODO: Should this use extract since the low half is unused?
2108   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2109   Register Hi = Unmerge.getReg(1);
2110 
2111   // Extract the upper half, since this is where we will find the sign and
2112   // exponent.
2113   auto Exp = extractF64Exponent(Hi, B);
2114 
2115   const unsigned FractBits = 52;
2116 
2117   // Extract the sign bit.
2118   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2119   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2120 
2121   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2122 
2123   const auto Zero32 = B.buildConstant(S32, 0);
2124 
2125   // Extend back to 64-bits.
2126   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2127 
2128   auto Shr = B.buildAShr(S64, FractMask, Exp);
2129   auto Not = B.buildNot(S64, Shr);
2130   auto Tmp0 = B.buildAnd(S64, Src, Not);
2131   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2132 
2133   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2134   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2135 
2136   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2137   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2138   MI.eraseFromParent();
2139   return true;
2140 }
2141 
2142 bool AMDGPULegalizerInfo::legalizeITOFP(
2143   MachineInstr &MI, MachineRegisterInfo &MRI,
2144   MachineIRBuilder &B, bool Signed) const {
2145 
2146   Register Dst = MI.getOperand(0).getReg();
2147   Register Src = MI.getOperand(1).getReg();
2148 
2149   const LLT S64 = LLT::scalar(64);
2150   const LLT S32 = LLT::scalar(32);
2151 
2152   assert(MRI.getType(Src) == S64);
2153 
2154   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2155   auto ThirtyTwo = B.buildConstant(S32, 32);
2156 
2157   if (MRI.getType(Dst) == S64) {
2158     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2159                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2160 
2161     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2162     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2163                      .addUse(CvtHi.getReg(0))
2164                      .addUse(ThirtyTwo.getReg(0));
2165 
2166     // TODO: Should this propagate fast-math-flags?
2167     B.buildFAdd(Dst, LdExp, CvtLo);
2168     MI.eraseFromParent();
2169     return true;
2170   }
2171 
2172   assert(MRI.getType(Dst) == S32);
2173 
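  // For a 32-bit result, shift the 64-bit source left so its significant bits
  // occupy the high word, fold any discarded low bits into a sticky bit for
  // correct rounding, convert that word to FP, and rescale the result with
  // ldexp(fval, 32 - shift).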
2174   auto One = B.buildConstant(S32, 1);
2175 
2176   MachineInstrBuilder ShAmt;
2177   if (Signed) {
2178     auto ThirtyOne = B.buildConstant(S32, 31);
2179     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2180     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2181     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2182     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2183                                /*HasSideEffects=*/false)
2184                   .addUse(Unmerge.getReg(1));
2185     auto LS2 = B.buildSub(S32, LS, One);
2186     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2187   } else
2188     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2189   auto Norm = B.buildShl(S64, Src, ShAmt);
2190   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2191   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2192   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2193   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2194   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2195   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2196                    /*HasSideEffects=*/false)
2197       .addUse(FVal.getReg(0))
2198       .addUse(Scale.getReg(0));
2199   MI.eraseFromParent();
2200   return true;
2201 }
2202 
2203 // TODO: Copied from DAG implementation. Verify logic and document how this
2204 // actually works.
2205 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2206                                         MachineRegisterInfo &MRI,
2207                                         MachineIRBuilder &B,
2208                                         bool Signed) const {
2209 
2210   Register Dst = MI.getOperand(0).getReg();
2211   Register Src = MI.getOperand(1).getReg();
2212 
2213   const LLT S64 = LLT::scalar(64);
2214   const LLT S32 = LLT::scalar(32);
2215 
2216   const LLT SrcLT = MRI.getType(Src);
2217   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2218 
2219   unsigned Flags = MI.getFlags();
2220 
2221   // The basic idea of converting a floating point number into a pair of 32-bit
2222   // integers is illustrated as follows:
2223   //
2224   //     tf := trunc(val);
2225   //    hif := floor(tf * 2^-32);
2226   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2227   //     hi := fptoi(hif);
2228   //     lo := fptoi(lof);
2229   //
2230   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2231   MachineInstrBuilder Sign;
2232   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa and
    // that's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2238     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2239     Trunc = B.buildFAbs(S32, Trunc, Flags);
2240   }
2241   MachineInstrBuilder K0, K1;
2242   if (SrcLT == S64) {
2243     K0 = B.buildFConstant(S64,
2244                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2245     K1 = B.buildFConstant(S64,
2246                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2247   } else {
2248     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2249     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2250   }
2251 
2252   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2253   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2254   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2255 
2256   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2257                                      : B.buildFPTOUI(S32, FloorMul);
2258   auto Lo = B.buildFPTOUI(S32, Fma);
2259 
2260   if (Signed && SrcLT == S32) {
2261     // Flip the result based on the signedness, which is either all 0s or 1s.
2262     Sign = B.buildMerge(S64, {Sign, Sign});
2263     // r := xor({lo, hi}, sign) - sign;
2264     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2265   } else
2266     B.buildMerge(Dst, {Lo, Hi});
2267   MI.eraseFromParent();
2268 
2269   return true;
2270 }
2271 
2272 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2273                                                MachineInstr &MI) const {
2274   MachineFunction &MF = Helper.MIRBuilder.getMF();
2275   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2276 
2277   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2278                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2279 
2280   // With ieee_mode disabled, the instructions have the correct behavior
2281   // already for G_FMINNUM/G_FMAXNUM
2282   if (!MFI->getMode().IEEE)
2283     return !IsIEEEOp;
2284 
2285   if (IsIEEEOp)
2286     return true;
2287 
2288   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2289 }
2290 
2291 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2292   MachineInstr &MI, MachineRegisterInfo &MRI,
2293   MachineIRBuilder &B) const {
2294   // TODO: Should move some of this into LegalizerHelper.
2295 
2296   // TODO: Promote dynamic indexing of s16 to s32
2297 
2298   // FIXME: Artifact combiner probably should have replaced the truncated
2299   // constant before this, so we shouldn't need
2300   // getIConstantVRegValWithLookThrough.
2301   Optional<ValueAndVReg> MaybeIdxVal =
2302       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2303   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2304     return true;
2305   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2306 
2307   Register Dst = MI.getOperand(0).getReg();
2308   Register Vec = MI.getOperand(1).getReg();
2309 
2310   LLT VecTy = MRI.getType(Vec);
2311   LLT EltTy = VecTy.getElementType();
2312   assert(EltTy == MRI.getType(Dst));
2313 
2314   if (IdxVal < VecTy.getNumElements()) {
2315     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2316     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2317   } else {
2318     B.buildUndef(Dst);
2319   }
2320 
2321   MI.eraseFromParent();
2322   return true;
2323 }
2324 
2325 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2326   MachineInstr &MI, MachineRegisterInfo &MRI,
2327   MachineIRBuilder &B) const {
2328   // TODO: Should move some of this into LegalizerHelper.
2329 
2330   // TODO: Promote dynamic indexing of s16 to s32
2331 
2332   // FIXME: Artifact combiner probably should have replaced the truncated
2333   // constant before this, so we shouldn't need
2334   // getIConstantVRegValWithLookThrough.
2335   Optional<ValueAndVReg> MaybeIdxVal =
2336       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2337   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2338     return true;
2339 
2340   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2341   Register Dst = MI.getOperand(0).getReg();
2342   Register Vec = MI.getOperand(1).getReg();
2343   Register Ins = MI.getOperand(2).getReg();
2344 
2345   LLT VecTy = MRI.getType(Vec);
2346   LLT EltTy = VecTy.getElementType();
2347   assert(EltTy == MRI.getType(Ins));
2348   (void)Ins;
2349 
2350   unsigned NumElts = VecTy.getNumElements();
2351   if (IdxVal < NumElts) {
2352     SmallVector<Register, 8> SrcRegs;
2353     for (unsigned i = 0; i < NumElts; ++i)
2354       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2355     B.buildUnmerge(SrcRegs, Vec);
2356 
2357     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2358     B.buildMerge(Dst, SrcRegs);
2359   } else {
2360     B.buildUndef(Dst);
2361   }
2362 
2363   MI.eraseFromParent();
2364   return true;
2365 }
2366 
2367 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2368   MachineInstr &MI, MachineRegisterInfo &MRI,
2369   MachineIRBuilder &B) const {
2370   const LLT V2S16 = LLT::fixed_vector(2, 16);
2371 
2372   Register Dst = MI.getOperand(0).getReg();
2373   Register Src0 = MI.getOperand(1).getReg();
2374   LLT DstTy = MRI.getType(Dst);
2375   LLT SrcTy = MRI.getType(Src0);
2376 
2377   if (SrcTy == V2S16 && DstTy == V2S16 &&
2378       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2379     return true;
2380 
2381   MachineIRBuilder HelperBuilder(MI);
2382   GISelObserverWrapper DummyObserver;
2383   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2384   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2385 }
2386 
2387 bool AMDGPULegalizerInfo::legalizeSinCos(
2388   MachineInstr &MI, MachineRegisterInfo &MRI,
2389   MachineIRBuilder &B) const {
2390 
2391   Register DstReg = MI.getOperand(0).getReg();
2392   Register SrcReg = MI.getOperand(1).getReg();
2393   LLT Ty = MRI.getType(DstReg);
2394   unsigned Flags = MI.getFlags();
2395 
2396   Register TrigVal;
2397   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2398   if (ST.hasTrigReducedRange()) {
2399     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2400     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2401       .addUse(MulVal.getReg(0))
2402       .setMIFlags(Flags).getReg(0);
2403   } else
2404     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2405 
2406   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2407     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2408   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2409     .addUse(TrigVal)
2410     .setMIFlags(Flags);
2411   MI.eraseFromParent();
2412   return true;
2413 }
2414 
2415 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2416                                                   MachineIRBuilder &B,
2417                                                   const GlobalValue *GV,
2418                                                   int64_t Offset,
2419                                                   unsigned GAFlags) const {
2420   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2421   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2422   // to the following code sequence:
2423   //
2424   // For constant address space:
2425   //   s_getpc_b64 s[0:1]
2426   //   s_add_u32 s0, s0, $symbol
2427   //   s_addc_u32 s1, s1, 0
2428   //
2429   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2430   //   a fixup or relocation is emitted to replace $symbol with a literal
2431   //   constant, which is a pc-relative offset from the encoding of the $symbol
2432   //   operand to the global variable.
2433   //
2434   // For global address space:
2435   //   s_getpc_b64 s[0:1]
2436   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2437   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2438   //
2439   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2440   //   fixups or relocations are emitted to replace $symbol@*@lo and
2441   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2442   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2443   //   operand to the global variable.
2444   //
2445   // What we want here is an offset from the value returned by s_getpc
2446   // (which is the address of the s_add_u32 instruction) to the global
2447   // variable, but since the encoding of $symbol starts 4 bytes after the start
2448   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2449   // small. This requires us to add 4 to the global variable offset in order to
2450   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2451   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2452   // instruction.
2453 
2454   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2455 
2456   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2457     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2458 
2459   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2460     .addDef(PCReg);
2461 
2462   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2463   if (GAFlags == SIInstrInfo::MO_NONE)
2464     MIB.addImm(0);
2465   else
2466     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2467 
2468   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2469 
2470   if (PtrTy.getSizeInBits() == 32)
2471     B.buildExtract(DstReg, PCReg, 0);
2472   return true;
2473  }
2474 
2475 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2476   MachineInstr &MI, MachineRegisterInfo &MRI,
2477   MachineIRBuilder &B) const {
2478   Register DstReg = MI.getOperand(0).getReg();
2479   LLT Ty = MRI.getType(DstReg);
2480   unsigned AS = Ty.getAddressSpace();
2481 
2482   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2483   MachineFunction &MF = B.getMF();
2484   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2485 
2486   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2487     if (!MFI->isModuleEntryFunction() &&
2488         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2489       const Function &Fn = MF.getFunction();
2490       DiagnosticInfoUnsupported BadLDSDecl(
2491         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2492         DS_Warning);
2493       Fn.getContext().diagnose(BadLDSDecl);
2494 
2495       // We currently don't have a way to correctly allocate LDS objects that
2496       // aren't directly associated with a kernel. We do force inlining of
2497       // functions that use local objects. However, if these dead functions are
2498       // not eliminated, we don't want a compile time error. Just emit a warning
2499       // and a trap, since there should be no callable path here.
2500       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2501       B.buildUndef(DstReg);
2502       MI.eraseFromParent();
2503       return true;
2504     }
2505 
2506     // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will be diagnosed during assembly emission anyway.
2509     const SITargetLowering *TLI = ST.getTargetLowering();
2510     if (!TLI->shouldUseLDSConstAddress(GV)) {
2511       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
2513     }
2514 
2515     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2516       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. These objects are allocated
      // by the runtime and placed directly after the statically allocated
      // ones. They all share the same offset.
2522       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2523         // Adjust alignment for that dynamic shared memory array.
2524         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2525         LLT S32 = LLT::scalar(32);
2526         auto Sz =
2527             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2528         B.buildIntToPtr(DstReg, Sz);
2529         MI.eraseFromParent();
2530         return true;
2531       }
2532     }
2533 
2534     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2535                                                    *cast<GlobalVariable>(GV)));
2536     MI.eraseFromParent();
2537     return true;
2538   }
2539 
2540   const SITargetLowering *TLI = ST.getTargetLowering();
2541 
2542   if (TLI->shouldEmitFixup(GV)) {
2543     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2544     MI.eraseFromParent();
2545     return true;
2546   }
2547 
2548   if (TLI->shouldEmitPCReloc(GV)) {
2549     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2550     MI.eraseFromParent();
2551     return true;
2552   }
2553 
2554   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2555   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2556 
2557   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2558   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2559       MachinePointerInfo::getGOT(MF),
2560       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2561           MachineMemOperand::MOInvariant,
2562       LoadTy, Align(8));
2563 
2564   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2565 
2566   if (Ty.getSizeInBits() == 32) {
2567     // Truncate if this is a 32-bit constant address.
2568     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2569     B.buildExtract(DstReg, Load, 0);
2570   } else
2571     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2572 
2573   MI.eraseFromParent();
2574   return true;
2575 }
2576 
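// Round a vector to the next power-of-2 element count, or a scalar to the
// next power-of-2 bit width.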
2577 static LLT widenToNextPowerOf2(LLT Ty) {
2578   if (Ty.isVector())
2579     return Ty.changeElementCount(
2580         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2581   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2582 }
2583 
2584 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2585                                        MachineInstr &MI) const {
2586   MachineIRBuilder &B = Helper.MIRBuilder;
2587   MachineRegisterInfo &MRI = *B.getMRI();
2588   GISelChangeObserver &Observer = Helper.Observer;
2589 
2590   Register PtrReg = MI.getOperand(1).getReg();
2591   LLT PtrTy = MRI.getType(PtrReg);
2592   unsigned AddrSpace = PtrTy.getAddressSpace();
2593 
2594   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2595     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2596     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2597     Observer.changingInstr(MI);
2598     MI.getOperand(1).setReg(Cast.getReg(0));
2599     Observer.changedInstr(MI);
2600     return true;
2601   }
2602 
2603   if (MI.getOpcode() != AMDGPU::G_LOAD)
2604     return false;
2605 
2606   Register ValReg = MI.getOperand(0).getReg();
2607   LLT ValTy = MRI.getType(ValReg);
2608 
2609   MachineMemOperand *MMO = *MI.memoperands_begin();
2610   const unsigned ValSize = ValTy.getSizeInBits();
2611   const LLT MemTy = MMO->getMemoryType();
2612   const Align MemAlign = MMO->getAlign();
2613   const unsigned MemSize = MemTy.getSizeInBits();
2614   const uint64_t AlignInBits = 8 * MemAlign.value();
2615 
2616   // Widen non-power-of-2 loads to the alignment if needed
2617   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2618     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2619 
2620     // This was already the correct extending load result type, so just adjust
2621     // the memory type.
2622     if (WideMemSize == ValSize) {
2623       MachineFunction &MF = B.getMF();
2624 
2625       MachineMemOperand *WideMMO =
2626           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2627       Observer.changingInstr(MI);
2628       MI.setMemRefs(MF, {WideMMO});
2629       Observer.changedInstr(MI);
2630       return true;
2631     }
2632 
    // Don't bother handling this edge case, which should probably never be
    // produced.
2634     if (ValSize > WideMemSize)
2635       return false;
2636 
2637     LLT WideTy = widenToNextPowerOf2(ValTy);
2638 
2639     Register WideLoad;
2640     if (!WideTy.isVector()) {
2641       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2642       B.buildTrunc(ValReg, WideLoad).getReg(0);
2643     } else {
2644       // Extract the subvector.
2645 
2646       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
2649         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2650         B.buildExtract(ValReg, WideLoad, 0);
2651       } else {
        // For cases where the widened type isn't a nice register value, load
        // the widened value and drop the trailing elements
        // (e.g. <3 x s16> -> <4 x s16>).
2654         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2655         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2656       }
2657     }
2658 
2659     MI.eraseFromParent();
2660     return true;
2661   }
2662 
2663   return false;
2664 }
2665 
2666 bool AMDGPULegalizerInfo::legalizeFMad(
2667   MachineInstr &MI, MachineRegisterInfo &MRI,
2668   MachineIRBuilder &B) const {
2669   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2670   assert(Ty.isScalar());
2671 
2672   MachineFunction &MF = B.getMF();
2673   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2674 
2675   // TODO: Always legal with future ftz flag.
2676   // FIXME: Do we need just output?
2677   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2678     return true;
2679   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2680     return true;
2681 
2682   MachineIRBuilder HelperBuilder(MI);
2683   GISelObserverWrapper DummyObserver;
2684   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2685   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2686 }
2687 
2688 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2689   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2690   Register DstReg = MI.getOperand(0).getReg();
2691   Register PtrReg = MI.getOperand(1).getReg();
2692   Register CmpVal = MI.getOperand(2).getReg();
2693   Register NewVal = MI.getOperand(3).getReg();
2694 
2695   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2696          "this should not have been custom lowered");
2697 
2698   LLT ValTy = MRI.getType(CmpVal);
2699   LLT VecTy = LLT::fixed_vector(2, ValTy);
2700 
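  // Pack the new value and the compare value into the single vector data
  // operand used by G_AMDGPU_ATOMIC_CMPXCHG (new value first).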
2701   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2702 
2703   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2704     .addDef(DstReg)
2705     .addUse(PtrReg)
2706     .addUse(PackedVal)
2707     .setMemRefs(MI.memoperands());
2708 
2709   MI.eraseFromParent();
2710   return true;
2711 }
2712 
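// log_b(x) = log2(x) * log_b(2); the caller passes log_b(2) as
// Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) = log10(2) for G_FLOG10).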
2713 bool AMDGPULegalizerInfo::legalizeFlog(
2714   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2715   Register Dst = MI.getOperand(0).getReg();
2716   Register Src = MI.getOperand(1).getReg();
2717   LLT Ty = B.getMRI()->getType(Dst);
2718   unsigned Flags = MI.getFlags();
2719 
2720   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2721   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2722 
2723   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2724   MI.eraseFromParent();
2725   return true;
2726 }
2727 
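// exp(x) = exp2(x * log2(e))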
2728 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2729                                        MachineIRBuilder &B) const {
2730   Register Dst = MI.getOperand(0).getReg();
2731   Register Src = MI.getOperand(1).getReg();
2732   unsigned Flags = MI.getFlags();
2733   LLT Ty = B.getMRI()->getType(Dst);
2734 
2735   auto K = B.buildFConstant(Ty, numbers::log2e);
2736   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2737   B.buildFExp2(Dst, Mul, Flags);
2738   MI.eraseFromParent();
2739   return true;
2740 }
2741 
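// pow(x, y) is expanded as exp2(y * log2(x)), with the multiply done by
// fmul_legacy (via an f32 extension for the f16 case).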
2742 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2743                                        MachineIRBuilder &B) const {
2744   Register Dst = MI.getOperand(0).getReg();
2745   Register Src0 = MI.getOperand(1).getReg();
2746   Register Src1 = MI.getOperand(2).getReg();
2747   unsigned Flags = MI.getFlags();
2748   LLT Ty = B.getMRI()->getType(Dst);
2749   const LLT S16 = LLT::scalar(16);
2750   const LLT S32 = LLT::scalar(32);
2751 
2752   if (Ty == S32) {
2753     auto Log = B.buildFLog2(S32, Src0, Flags);
2754     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2755       .addUse(Log.getReg(0))
2756       .addUse(Src1)
2757       .setMIFlags(Flags);
2758     B.buildFExp2(Dst, Mul, Flags);
2759   } else if (Ty == S16) {
2760     // There's no f16 fmul_legacy, so we need to convert for it.
2761     auto Log = B.buildFLog2(S16, Src0, Flags);
2762     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2763     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2764     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2765       .addUse(Ext0.getReg(0))
2766       .addUse(Ext1.getReg(0))
2767       .setMIFlags(Flags);
2768 
2769     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2770   } else
2771     return false;
2772 
2773   MI.eraseFromParent();
2774   return true;
2775 }
2776 
2777 // Find a source register, ignoring any possible source modifiers.
2778 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2779   Register ModSrc = OrigSrc;
2780   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2781     ModSrc = SrcFNeg->getOperand(1).getReg();
2782     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2783       ModSrc = SrcFAbs->getOperand(1).getReg();
2784   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2785     ModSrc = SrcFAbs->getOperand(1).getReg();
2786   return ModSrc;
2787 }
2788 
2789 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2790                                          MachineRegisterInfo &MRI,
2791                                          MachineIRBuilder &B) const {
2792 
2793   const LLT S1 = LLT::scalar(1);
2794   const LLT S64 = LLT::scalar(64);
2795   Register Dst = MI.getOperand(0).getReg();
2796   Register OrigSrc = MI.getOperand(1).getReg();
2797   unsigned Flags = MI.getFlags();
2798   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2799          "this should not have been custom lowered");
2800 
2801   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2802   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2803   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2804   // V_FRACT bug is:
2805   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2806   //
2807   // Convert floor(x) to (x - fract(x))
2808 
2809   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2810     .addUse(OrigSrc)
2811     .setMIFlags(Flags);
2812 
2813   // Give source modifier matching some assistance before obscuring a foldable
2814   // pattern.
2815 
  // TODO: Can we avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
2818   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2819 
2820   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2821 
2822   Register Min = MRI.createGenericVirtualRegister(S64);
2823 
2824   // We don't need to concern ourselves with the snan handling difference, so
2825   // use the one which will directly select.
2826   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2827   if (MFI->getMode().IEEE)
2828     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2829   else
2830     B.buildFMinNum(Min, Fract, Const, Flags);
2831 
2832   Register CorrectedFract = Min;
2833   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2834     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2835     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2836   }
2837 
2838   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2839   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2840 
2841   MI.eraseFromParent();
2842   return true;
2843 }
2844 
2845 // Turn an illegal packed v2s16 build vector into bit operations.
2846 // TODO: This should probably be a bitcast action in LegalizerHelper.
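     //
     // Rough shape of the expansion (pseudo-IR sketch):
     //   %merge:_(s32)     = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
     //   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)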
2847 bool AMDGPULegalizerInfo::legalizeBuildVector(
2848   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2849   Register Dst = MI.getOperand(0).getReg();
2850   const LLT S32 = LLT::scalar(32);
2851   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2852 
2853   Register Src0 = MI.getOperand(1).getReg();
2854   Register Src1 = MI.getOperand(2).getReg();
2855   assert(MRI.getType(Src0) == LLT::scalar(16));
2856 
2857   auto Merge = B.buildMerge(S32, {Src0, Src1});
2858   B.buildBitcast(Dst, Merge);
2859 
2860   MI.eraseFromParent();
2861   return true;
2862 }
2863 
2864 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2865 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2866 // case with a single min instruction instead of a compare+select.
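     //
     // e.g. a 32-bit G_CTLZ becomes (sketch):
     //   %ffbh:_(s32) = G_AMDGPU_FFBH_U32 %src   ; returns -1 for a zero input
     //   %dst:_(s32)  = G_UMIN %ffbh, 32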
2867 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2868                                             MachineRegisterInfo &MRI,
2869                                             MachineIRBuilder &B) const {
2870   Register Dst = MI.getOperand(0).getReg();
2871   Register Src = MI.getOperand(1).getReg();
2872   LLT DstTy = MRI.getType(Dst);
2873   LLT SrcTy = MRI.getType(Src);
2874 
2875   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2876                         ? AMDGPU::G_AMDGPU_FFBH_U32
2877                         : AMDGPU::G_AMDGPU_FFBL_B32;
2878   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2879   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2880 
2881   MI.eraseFromParent();
2882   return true;
2883 }
2884 
2885 // Check that this is a G_XOR x, -1
2886 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2887   if (MI.getOpcode() != TargetOpcode::G_XOR)
2888     return false;
2889   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2890   return ConstVal && *ConstVal == -1;
2891 }
2892 
2893 // Return the use branch instruction, otherwise null if the usage is invalid.
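     // The expected use pattern is, roughly:
     //   %cond = G_INTRINSIC ...           ; the control flow intrinsic
     //   [%neg = G_XOR %cond, -1]          ; optional negation
     //   G_BRCOND %cond|%neg, %bb.target
     //   [G_BR %bb.fallthrough]            ; or simply fall through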
2894 static MachineInstr *
2895 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2896                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2897   Register CondDef = MI.getOperand(0).getReg();
2898   if (!MRI.hasOneNonDBGUse(CondDef))
2899     return nullptr;
2900 
2901   MachineBasicBlock *Parent = MI.getParent();
2902   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2903 
2904   if (isNot(MRI, *UseMI)) {
2905     Register NegatedCond = UseMI->getOperand(0).getReg();
2906     if (!MRI.hasOneNonDBGUse(NegatedCond))
2907       return nullptr;
2908 
2909     // The negation is folded into the branch handling, so erase the now-dead G_XOR.
2910     eraseInstr(*UseMI, MRI);
2911 
2912     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2913     Negated = true;
2914   }
2915 
2916   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2917     return nullptr;
2918 
2919   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2920   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2921   if (Next == Parent->end()) {
2922     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2923     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2924       return nullptr;
2925     UncondBrTarget = &*NextMBB;
2926   } else {
2927     if (Next->getOpcode() != AMDGPU::G_BR)
2928       return nullptr;
2929     Br = &*Next;
2930     UncondBrTarget = Br->getOperand(0).getMBB();
2931   }
2932 
2933   return UseMI;
2934 }
2935 
2936 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2937                                          const ArgDescriptor *Arg,
2938                                          const TargetRegisterClass *ArgRC,
2939                                          LLT ArgTy) const {
2940   MCRegister SrcReg = Arg->getRegister();
2941   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2942   assert(DstReg.isVirtual() && "Virtual register expected");
2943 
2944   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2945                                              *ArgRC, B.getDebugLoc(), ArgTy);
2946   if (Arg->isMasked()) {
2947     // TODO: Should we try to emit this once in the entry block?
2948     const LLT S32 = LLT::scalar(32);
2949     const unsigned Mask = Arg->getMask();
2950     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
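         // e.g. the packed workitem IDs put x in bits [9:0], y in [19:10] and
         // z in [29:20], so extracting y lowers to a lshr by 10 followed by an
         // and with 0x3ff.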
2951 
2952     Register AndMaskSrc = LiveIn;
2953 
2954     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2955     // 0.
2956     if (Shift != 0) {
2957       auto ShiftAmt = B.buildConstant(S32, Shift);
2958       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2959     }
2960 
2961     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2962   } else {
2963     B.buildCopy(DstReg, LiveIn);
2964   }
2965 
2966   return true;
2967 }
2968 
2969 bool AMDGPULegalizerInfo::loadInputValue(
2970     Register DstReg, MachineIRBuilder &B,
2971     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2972   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2973   const ArgDescriptor *Arg;
2974   const TargetRegisterClass *ArgRC;
2975   LLT ArgTy;
2976   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2977 
2978   if (!Arg) {
2979     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2980       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2981       // case the pointer argument may be missing and we use null.
2982       B.buildConstant(DstReg, 0);
2983       return true;
2984     }
2985 
2986     // It's undefined behavior if a function marked with the amdgpu-no-*
2987     // attributes uses the corresponding intrinsic.
2988     B.buildUndef(DstReg);
2989     return true;
2990   }
2991 
2992   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2993     return false; // TODO: Handle these
2994   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2995 }
2996 
2997 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2998     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2999     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3000   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3001     return false;
3002 
3003   MI.eraseFromParent();
3004   return true;
3005 }
3006 
3007 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3008                                 int64_t C) {
3009   B.buildConstant(MI.getOperand(0).getReg(), C);
3010   MI.eraseFromParent();
3011   return true;
3012 }
3013 
3014 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3015     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3016     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3017   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3018   if (MaxID == 0)
3019     return replaceWithConstant(B, MI, 0);
3020 
3021   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3022   const ArgDescriptor *Arg;
3023   const TargetRegisterClass *ArgRC;
3024   LLT ArgTy;
3025   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3026 
3027   Register DstReg = MI.getOperand(0).getReg();
3028   if (!Arg) {
3029     // It's undefined behavior if a function marked with the amdgpu-no-*
3030     // attributes uses the corresponding intrinsic.
3031     B.buildUndef(DstReg);
3032     MI.eraseFromParent();
3033     return true;
3034   }
3035 
3036   if (Arg->isMasked()) {
3037     // Don't bother inserting AssertZext for packed IDs since we're emitting the
3038     // masking operations anyway.
3039     //
3040     // TODO: We could assert the top bit is 0 for the source copy.
3041     if (!loadInputValue(DstReg, B, ArgType))
3042       return false;
3043   } else {
3044     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3045     if (!loadInputValue(TmpReg, B, ArgType))
3046       return false;
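         // A value no larger than MaxID needs only 32 - countLeadingZeros(MaxID)
         // bits, so tell the rest of the pipeline that the high bits are zero.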
3047     B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3048   }
3049 
3050   MI.eraseFromParent();
3051   return true;
3052 }
3053 
3054 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3055                                                      int64_t Offset) const {
3056   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3057   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3058 
3059   // TODO: If we passed in the base kernel offset we could have a better
3060   // alignment than 4, but we don't really need it.
3061   if (!loadInputValue(KernArgReg, B,
3062                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3063     llvm_unreachable("failed to find kernarg segment ptr");
3064 
3065   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3066   // TODO: Should get nuw
3067   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3068 }
3069 
3070 /// Legalize a value that's loaded from kernel arguments. This is only used by
3071 /// legacy intrinsics.
3072 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3073                                                       MachineIRBuilder &B,
3074                                                       uint64_t Offset,
3075                                                       Align Alignment) const {
3076   Register DstReg = MI.getOperand(0).getReg();
3077 
3078   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3079          "unexpected kernarg parameter type");
3080 
3081   Register Ptr = getKernargParameterPtr(B, Offset);
3082   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3083   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3084               MachineMemOperand::MODereferenceable |
3085                   MachineMemOperand::MOInvariant);
3086   MI.eraseFromParent();
3087   return true;
3088 }
3089 
3090 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3091                                        MachineRegisterInfo &MRI,
3092                                        MachineIRBuilder &B) const {
3093   Register Dst = MI.getOperand(0).getReg();
3094   LLT DstTy = MRI.getType(Dst);
3095   LLT S16 = LLT::scalar(16);
3096   LLT S32 = LLT::scalar(32);
3097   LLT S64 = LLT::scalar(64);
3098 
3099   if (DstTy == S16)
3100     return legalizeFDIV16(MI, MRI, B);
3101   if (DstTy == S32)
3102     return legalizeFDIV32(MI, MRI, B);
3103   if (DstTy == S64)
3104     return legalizeFDIV64(MI, MRI, B);
3105 
3106   return false;
3107 }
3108 
3109 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3110                                                         Register DstDivReg,
3111                                                         Register DstRemReg,
3112                                                         Register X,
3113                                                         Register Y) const {
3114   const LLT S1 = LLT::scalar(1);
3115   const LLT S32 = LLT::scalar(32);
3116 
3117   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3118   // algorithm used here.
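       //
       // In short (sketch): start from z ~= 2^32 / y computed via the f32
       // reciprocal, apply one Newton-Raphson style refinement
       //   z = z + umulh(z, -y * z)
       // then take q = umulh(x, z), r = x - q * y, and correct q/r at most
       // twice while r >= y.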
3119 
3120   // Initial estimate of inv(y).
3121   auto FloatY = B.buildUITOFP(S32, Y);
3122   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3123   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3124   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3125   auto Z = B.buildFPTOUI(S32, ScaledY);
3126 
3127   // One round of UNR.
3128   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3129   auto NegYZ = B.buildMul(S32, NegY, Z);
3130   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3131 
3132   // Quotient/remainder estimate.
3133   auto Q = B.buildUMulH(S32, X, Z);
3134   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3135 
3136   // First quotient/remainder refinement.
3137   auto One = B.buildConstant(S32, 1);
3138   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3139   if (DstDivReg)
3140     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3141   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3142 
3143   // Second quotient/remainder refinement.
3144   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3145   if (DstDivReg)
3146     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3147 
3148   if (DstRemReg)
3149     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3150 }
3151 
3152 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3153 //
3154 // Return lo, hi of result
3155 //
3156 // %cvt.lo = G_UITOFP Val.lo
3157 // %cvt.hi = G_UITOFP Val.hi
3158 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3159 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3160 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3161 // %mul2 = G_FMUL %mul1, 2**(-32)
3162 // %trunc = G_INTRINSIC_TRUNC %mul2
3163 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3164 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
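     //
     // The f32 bit patterns used below are, roughly: 0x4f800000 = 2^32,
     // 0x2f800000 = 2^-32, 0xcf800000 = -(2^32), and 0x5f7ffffc sits just below
     // 2^64, so the sequence approximates 2^64 / Val split into 32-bit halves.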
3165 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3166                                                        Register Val) {
3167   const LLT S32 = LLT::scalar(32);
3168   auto Unmerge = B.buildUnmerge(S32, Val);
3169 
3170   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3171   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3172 
3173   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3174                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3175 
3176   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3177   auto Mul1 =
3178       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3179 
3180   // 2**(-32)
3181   auto Mul2 =
3182       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3183   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3184 
3185   // -(2**32)
3186   auto Mad2 = B.buildFMAD(S32, Trunc,
3187                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3188 
3189   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3190   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3191 
3192   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3193 }
3194 
3195 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3196                                                         Register DstDivReg,
3197                                                         Register DstRemReg,
3198                                                         Register Numer,
3199                                                         Register Denom) const {
3200   const LLT S32 = LLT::scalar(32);
3201   const LLT S64 = LLT::scalar(64);
3202   const LLT S1 = LLT::scalar(1);
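       // Same overall shape as the 32-bit path (sketch): start from an
       // approximate 2^64 / Denom (emitReciprocalU64), refine it with two
       // multiply/add rounds carried out in 32-bit halves, take
       // q = umulh(Numer, rcp), and then correct the quotient/remainder at most
       // twice by comparing against the denominator.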
3203   Register RcpLo, RcpHi;
3204 
3205   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3206 
3207   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3208 
3209   auto Zero64 = B.buildConstant(S64, 0);
3210   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3211 
3212   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3213   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3214 
3215   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3216   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3217   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3218 
3219   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3220   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3221   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3222 
3223   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3224   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3225   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3226   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3227   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3228 
3229   auto Zero32 = B.buildConstant(S32, 0);
3230   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3231   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3232   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3233 
3234   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3235   Register NumerLo = UnmergeNumer.getReg(0);
3236   Register NumerHi = UnmergeNumer.getReg(1);
3237 
3238   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3239   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3240   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3241   Register Mul3_Lo = UnmergeMul3.getReg(0);
3242   Register Mul3_Hi = UnmergeMul3.getReg(1);
3243   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3244   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3245   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3246   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3247 
3248   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3249   Register DenomLo = UnmergeDenom.getReg(0);
3250   Register DenomHi = UnmergeDenom.getReg(1);
3251 
3252   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3253   auto C1 = B.buildSExt(S32, CmpHi);
3254 
3255   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3256   auto C2 = B.buildSExt(S32, CmpLo);
3257 
3258   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3259   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3260 
3261   // TODO: Here and below, portions of the code could be enclosed in if/endif.
3262   // Currently the control flow is unconditional, and we have 4 selects after
3263   // the potential endif to substitute PHIs.
3264 
3265   // if C3 != 0 ...
3266   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3267   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3268   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3269   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3270 
3271   auto One64 = B.buildConstant(S64, 1);
3272   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3273 
3274   auto C4 =
3275       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3276   auto C5 =
3277       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3278   auto C6 = B.buildSelect(
3279       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3280 
3281   // if (C6 != 0)
3282   auto Add4 = B.buildAdd(S64, Add3, One64);
3283   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3284 
3285   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3286   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3287   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3288 
3289   // endif C6
3290   // endif C3
3291 
3292   if (DstDivReg) {
3293     auto Sel1 = B.buildSelect(
3294         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3295     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3296                   Sel1, MulHi3);
3297   }
3298 
3299   if (DstRemReg) {
3300     auto Sel2 = B.buildSelect(
3301         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3302     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3303                   Sel2, Sub1);
3304   }
3305 }
3306 
3307 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3308                                                   MachineRegisterInfo &MRI,
3309                                                   MachineIRBuilder &B) const {
3310   Register DstDivReg, DstRemReg;
3311   switch (MI.getOpcode()) {
3312   default:
3313     llvm_unreachable("Unexpected opcode!");
3314   case AMDGPU::G_UDIV: {
3315     DstDivReg = MI.getOperand(0).getReg();
3316     break;
3317   }
3318   case AMDGPU::G_UREM: {
3319     DstRemReg = MI.getOperand(0).getReg();
3320     break;
3321   }
3322   case AMDGPU::G_UDIVREM: {
3323     DstDivReg = MI.getOperand(0).getReg();
3324     DstRemReg = MI.getOperand(1).getReg();
3325     break;
3326   }
3327   }
3328 
3329   const LLT S64 = LLT::scalar(64);
3330   const LLT S32 = LLT::scalar(32);
3331   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3332   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3333   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3334   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3335 
3336   if (Ty == S32)
3337     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3338   else if (Ty == S64)
3339     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3340   else
3341     return false;
3342 
3343   MI.eraseFromParent();
3344   return true;
3345 }
3346 
3347 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3348                                                 MachineRegisterInfo &MRI,
3349                                                 MachineIRBuilder &B) const {
3350   const LLT S64 = LLT::scalar(64);
3351   const LLT S32 = LLT::scalar(32);
3352 
3353   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3354   if (Ty != S32 && Ty != S64)
3355     return false;
3356 
3357   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3358   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3359   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3360 
3361   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3362   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3363   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3364 
3365   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3366   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3367 
3368   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3369   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
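       // The add/xor pairs above compute |LHS| and |RHS|: with
       // sign = v >> (bits - 1), (v + sign) ^ sign is -v for negative v and v
       // otherwise. The same trick in reverse, (tmp ^ s) - s, reapplies the
       // proper sign below.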
3370 
3371   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3372   switch (MI.getOpcode()) {
3373   default:
3374     llvm_unreachable("Unexpected opcode!");
3375   case AMDGPU::G_SDIV: {
3376     DstDivReg = MI.getOperand(0).getReg();
3377     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3378     break;
3379   }
3380   case AMDGPU::G_SREM: {
3381     DstRemReg = MI.getOperand(0).getReg();
3382     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3383     break;
3384   }
3385   case AMDGPU::G_SDIVREM: {
3386     DstDivReg = MI.getOperand(0).getReg();
3387     DstRemReg = MI.getOperand(1).getReg();
3388     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3389     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3390     break;
3391   }
3392   }
3393 
3394   if (Ty == S32)
3395     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3396   else
3397     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3398 
3399   if (DstDivReg) {
3400     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3401     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3402     B.buildSub(DstDivReg, SignXor, Sign);
3403   }
3404 
3405   if (DstRemReg) {
3406     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3407     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3408     B.buildSub(DstRemReg, SignXor, Sign);
3409   }
3410 
3411   MI.eraseFromParent();
3412   return true;
3413 }
3414 
3415 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3416                                                  MachineRegisterInfo &MRI,
3417                                                  MachineIRBuilder &B) const {
3418   Register Res = MI.getOperand(0).getReg();
3419   Register LHS = MI.getOperand(1).getReg();
3420   Register RHS = MI.getOperand(2).getReg();
3421   uint16_t Flags = MI.getFlags();
3422   LLT ResTy = MRI.getType(Res);
3423 
3424   const MachineFunction &MF = B.getMF();
3425   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3426                             MI.getFlag(MachineInstr::FmAfn);
3427 
3428   if (!AllowInaccurateRcp)
3429     return false;
3430 
3431   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3432     // 1 / x -> RCP(x)
3433     if (CLHS->isExactlyValue(1.0)) {
3434       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3435         .addUse(RHS)
3436         .setMIFlags(Flags);
3437 
3438       MI.eraseFromParent();
3439       return true;
3440     }
3441 
3442     // -1 / x -> RCP( FNEG(x) )
3443     if (CLHS->isExactlyValue(-1.0)) {
3444       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3445       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3446         .addUse(FNeg.getReg(0))
3447         .setMIFlags(Flags);
3448 
3449       MI.eraseFromParent();
3450       return true;
3451     }
3452   }
3453 
3454   // x / y -> x * (1.0 / y)
3455   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3456     .addUse(RHS)
3457     .setMIFlags(Flags);
3458   B.buildFMul(Res, LHS, RCP, Flags);
3459 
3460   MI.eraseFromParent();
3461   return true;
3462 }
3463 
3464 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3465                                                    MachineRegisterInfo &MRI,
3466                                                    MachineIRBuilder &B) const {
3467   Register Res = MI.getOperand(0).getReg();
3468   Register X = MI.getOperand(1).getReg();
3469   Register Y = MI.getOperand(2).getReg();
3470   uint16_t Flags = MI.getFlags();
3471   LLT ResTy = MRI.getType(Res);
3472 
3473   const MachineFunction &MF = B.getMF();
3474   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3475                             MI.getFlag(MachineInstr::FmAfn);
3476 
3477   if (!AllowInaccurateRcp)
3478     return false;
3479 
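       // Sketch of the expansion: start from the hardware rcp estimate
       // r ~= 1 / y, apply two Newton-Raphson steps r = r + r * (1 - y * r),
       // form ret = x * r, and finish with one fma-based correction of the
       // product.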
3480   auto NegY = B.buildFNeg(ResTy, Y);
3481   auto One = B.buildFConstant(ResTy, 1.0);
3482 
3483   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3484     .addUse(Y)
3485     .setMIFlags(Flags);
3486 
3487   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3488   R = B.buildFMA(ResTy, Tmp0, R, R);
3489 
3490   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3491   R = B.buildFMA(ResTy, Tmp1, R, R);
3492 
3493   auto Ret = B.buildFMul(ResTy, X, R);
3494   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3495 
3496   B.buildFMA(Res, Tmp2, R, Ret);
3497   MI.eraseFromParent();
3498   return true;
3499 }
3500 
3501 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3502                                          MachineRegisterInfo &MRI,
3503                                          MachineIRBuilder &B) const {
3504   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3505     return true;
3506 
3507   Register Res = MI.getOperand(0).getReg();
3508   Register LHS = MI.getOperand(1).getReg();
3509   Register RHS = MI.getOperand(2).getReg();
3510 
3511   uint16_t Flags = MI.getFlags();
3512 
3513   LLT S16 = LLT::scalar(16);
3514   LLT S32 = LLT::scalar(32);
3515 
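       // Do the division at f32 precision around the rcp estimate and let
       // div_fixup patch up the f16 result, roughly:
       //   quot = fptrunc(fpext(lhs) * rcp(fpext(rhs)))
       //   res  = llvm.amdgcn.div.fixup.f16(quot, rhs, lhs)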
3516   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3517   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3518 
3519   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3520     .addUse(RHSExt.getReg(0))
3521     .setMIFlags(Flags);
3522 
3523   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3524   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3525 
3526   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3527     .addUse(RDst.getReg(0))
3528     .addUse(RHS)
3529     .addUse(LHS)
3530     .setMIFlags(Flags);
3531 
3532   MI.eraseFromParent();
3533   return true;
3534 }
3535 
3536 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3537 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3538 static void toggleSPDenormMode(bool Enable,
3539                                MachineIRBuilder &B,
3540                                const GCNSubtarget &ST,
3541                                AMDGPU::SIModeRegisterDefaults Mode) {
3542   // Set SP denorm mode to this value.
3543   unsigned SPDenormMode =
3544     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3545 
3546   if (ST.hasDenormModeInst()) {
3547     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3548     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3549 
3550     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3551     B.buildInstr(AMDGPU::S_DENORM_MODE)
3552       .addImm(NewDenormModeValue);
3553 
3554   } else {
3555     // Select FP32 bit field in mode register.
3556     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3557                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3558                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
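         // i.e. hwreg(HW_REG_MODE, offset = 4, width = 2): the two FP32 denorm
         // bits of the MODE register (WIDTH_M1 encodes width - 1).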
3559 
3560     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3561       .addImm(SPDenormMode)
3562       .addImm(SPDenormModeBitField);
3563   }
3564 }
3565 
3566 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3567                                          MachineRegisterInfo &MRI,
3568                                          MachineIRBuilder &B) const {
3569   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3570     return true;
3571 
3572   Register Res = MI.getOperand(0).getReg();
3573   Register LHS = MI.getOperand(1).getReg();
3574   Register RHS = MI.getOperand(2).getReg();
3575   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3576   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3577 
3578   uint16_t Flags = MI.getFlags();
3579 
3580   LLT S32 = LLT::scalar(32);
3581   LLT S1 = LLT::scalar(1);
3582 
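       // This is the usual div_scale / div_fmas / div_fixup expansion (sketch):
       // scale the operands, refine rcp(denominator) with a chain of fmas,
       // multiply by the scaled numerator, then div_fmas and div_fixup undo the
       // scaling and handle the exceptional cases.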
3583   auto One = B.buildFConstant(S32, 1.0f);
3584 
3585   auto DenominatorScaled =
3586     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3587       .addUse(LHS)
3588       .addUse(RHS)
3589       .addImm(0)
3590       .setMIFlags(Flags);
3591   auto NumeratorScaled =
3592     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3593       .addUse(LHS)
3594       .addUse(RHS)
3595       .addImm(1)
3596       .setMIFlags(Flags);
3597 
3598   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3599     .addUse(DenominatorScaled.getReg(0))
3600     .setMIFlags(Flags);
3601   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3602 
3603   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3604   // aren't modeled as reading it.
3605   if (!Mode.allFP32Denormals())
3606     toggleSPDenormMode(true, B, ST, Mode);
3607 
3608   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3609   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3610   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3611   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3612   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3613   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3614 
3615   if (!Mode.allFP32Denormals())
3616     toggleSPDenormMode(false, B, ST, Mode);
3617 
3618   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3619     .addUse(Fma4.getReg(0))
3620     .addUse(Fma1.getReg(0))
3621     .addUse(Fma3.getReg(0))
3622     .addUse(NumeratorScaled.getReg(1))
3623     .setMIFlags(Flags);
3624 
3625   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3626     .addUse(Fmas.getReg(0))
3627     .addUse(RHS)
3628     .addUse(LHS)
3629     .setMIFlags(Flags);
3630 
3631   MI.eraseFromParent();
3632   return true;
3633 }
3634 
3635 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3636                                          MachineRegisterInfo &MRI,
3637                                          MachineIRBuilder &B) const {
3638   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3639     return true;
3640 
3641   Register Res = MI.getOperand(0).getReg();
3642   Register LHS = MI.getOperand(1).getReg();
3643   Register RHS = MI.getOperand(2).getReg();
3644 
3645   uint16_t Flags = MI.getFlags();
3646 
3647   LLT S64 = LLT::scalar(64);
3648   LLT S1 = LLT::scalar(1);
3649 
3650   auto One = B.buildFConstant(S64, 1.0);
3651 
3652   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3653     .addUse(LHS)
3654     .addUse(RHS)
3655     .addImm(0)
3656     .setMIFlags(Flags);
3657 
3658   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3659 
3660   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3661     .addUse(DivScale0.getReg(0))
3662     .setMIFlags(Flags);
3663 
3664   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3665   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3666   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3667 
3668   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3669     .addUse(LHS)
3670     .addUse(RHS)
3671     .addImm(1)
3672     .setMIFlags(Flags);
3673 
3674   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3675   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3676   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3677 
3678   Register Scale;
3679   if (!ST.hasUsableDivScaleConditionOutput()) {
3680     // Workaround a hardware bug on SI where the condition output from div_scale
3681     // is not usable.
3682 
3683     LLT S32 = LLT::scalar(32);
3684 
3685     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3686     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3687     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3688     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3689 
3690     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3691                               Scale1Unmerge.getReg(1));
3692     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3693                               Scale0Unmerge.getReg(1));
3694     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3695   } else {
3696     Scale = DivScale1.getReg(1);
3697   }
3698 
3699   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3700     .addUse(Fma4.getReg(0))
3701     .addUse(Fma3.getReg(0))
3702     .addUse(Mul.getReg(0))
3703     .addUse(Scale)
3704     .setMIFlags(Flags);
3705 
3706   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3707     .addUse(Fmas.getReg(0))
3708     .addUse(RHS)
3709     .addUse(LHS)
3710     .setMIFlags(Flags);
3711 
3712   MI.eraseFromParent();
3713   return true;
3714 }
3715 
3716 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3717                                                  MachineRegisterInfo &MRI,
3718                                                  MachineIRBuilder &B) const {
3719   Register Res = MI.getOperand(0).getReg();
3720   Register LHS = MI.getOperand(2).getReg();
3721   Register RHS = MI.getOperand(3).getReg();
3722   uint16_t Flags = MI.getFlags();
3723 
3724   LLT S32 = LLT::scalar(32);
3725   LLT S1 = LLT::scalar(1);
3726 
3727   auto Abs = B.buildFAbs(S32, RHS, Flags);
3728   const APFloat C0Val(1.0f);
3729 
3730   auto C0 = B.buildConstant(S32, 0x6f800000);
3731   auto C1 = B.buildConstant(S32, 0x2f800000);
3732   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
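       // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns: when
       // |rhs| is very large, pre-scale it by 2^-32 so the rcp result doesn't
       // flush to zero, then fold the same scale back in via the final multiply.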
3733 
3734   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3735   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3736 
3737   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3738 
3739   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3740     .addUse(Mul0.getReg(0))
3741     .setMIFlags(Flags);
3742 
3743   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3744 
3745   B.buildFMul(Res, Sel, Mul1, Flags);
3746 
3747   MI.eraseFromParent();
3748   return true;
3749 }
3750 
3751 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3752 // FIXME: Why do we handle this one but not other removed instructions?
3753 //
3754 // Reciprocal square root.  The clamp prevents infinite results, clamping
3755 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3756 // +-max_float.
3757 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3758                                                     MachineRegisterInfo &MRI,
3759                                                     MachineIRBuilder &B) const {
3760   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3761     return true;
3762 
3763   Register Dst = MI.getOperand(0).getReg();
3764   Register Src = MI.getOperand(2).getReg();
3765   auto Flags = MI.getFlags();
3766 
3767   LLT Ty = MRI.getType(Dst);
3768 
3769   const fltSemantics *FltSemantics;
3770   if (Ty == LLT::scalar(32))
3771     FltSemantics = &APFloat::IEEEsingle();
3772   else if (Ty == LLT::scalar(64))
3773     FltSemantics = &APFloat::IEEEdouble();
3774   else
3775     return false;
3776 
3777   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3778     .addUse(Src)
3779     .setMIFlags(Flags);
3780 
3781   // We don't need to concern ourselves with the snan handling difference;
3782   // the rsq already quieted (or not), so use the form that selects directly.
3783   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3784   const bool UseIEEE = MFI->getMode().IEEE;
3785 
3786   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3787   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3788                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3789 
3790   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3791 
3792   if (UseIEEE)
3793     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3794   else
3795     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3796   MI.eraseFromParent();
3797   return true;
3798 }
3799 
3800 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3801   switch (IID) {
3802   case Intrinsic::amdgcn_ds_fadd:
3803     return AMDGPU::G_ATOMICRMW_FADD;
3804   case Intrinsic::amdgcn_ds_fmin:
3805     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3806   case Intrinsic::amdgcn_ds_fmax:
3807     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3808   default:
3809     llvm_unreachable("not a DS FP intrinsic");
3810   }
3811 }
3812 
3813 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3814                                                       MachineInstr &MI,
3815                                                       Intrinsic::ID IID) const {
3816   GISelChangeObserver &Observer = Helper.Observer;
3817   Observer.changingInstr(MI);
3818 
3819   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3820 
3821   // The remaining operands were used to set fields in the MemOperand on
3822   // construction.
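       // e.g. @llvm.amdgcn.ds.fadd(ptr, val, ordering, scope, isVolatile)
       // becomes, roughly, G_ATOMICRMW_FADD dst, ptr, val with the extra
       // arguments already reflected in the MMO.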
3823   for (int I = 6; I > 3; --I)
3824     MI.removeOperand(I);
3825 
3826   MI.removeOperand(1); // Remove the intrinsic ID.
3827   Observer.changedInstr(MI);
3828   return true;
3829 }
3830 
3831 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3832                                             MachineRegisterInfo &MRI,
3833                                             MachineIRBuilder &B) const {
3834   uint64_t Offset =
3835     ST.getTargetLowering()->getImplicitParameterOffset(
3836       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3837   LLT DstTy = MRI.getType(DstReg);
3838   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3839 
3840   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3841   if (!loadInputValue(KernargPtrReg, B,
3842                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3843     return false;
3844 
3845   // FIXME: This should be nuw
3846   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3847   return true;
3848 }
3849 
3850 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3851                                                  MachineRegisterInfo &MRI,
3852                                                  MachineIRBuilder &B) const {
3853   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3854   if (!MFI->isEntryFunction()) {
3855     return legalizePreloadedArgIntrin(MI, MRI, B,
3856                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3857   }
3858 
3859   Register DstReg = MI.getOperand(0).getReg();
3860   if (!getImplicitArgPtr(DstReg, MRI, B))
3861     return false;
3862 
3863   MI.eraseFromParent();
3864   return true;
3865 }
3866 
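     // llvm.amdgcn.is.shared / llvm.amdgcn.is.private lower to a comparison of
     // the flat pointer's high 32 bits against the corresponding aperture base.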
3867 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3868                                               MachineRegisterInfo &MRI,
3869                                               MachineIRBuilder &B,
3870                                               unsigned AddrSpace) const {
3871   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3872   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3873   Register Hi32 = Unmerge.getReg(1);
3874 
3875   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3876   MI.eraseFromParent();
3877   return true;
3878 }
3879 
3880 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3881 // offset (the offset that is included in bounds checking and swizzling, to be
3882 // split between the instruction's voffset and immoffset fields) and soffset
3883 // (the offset that is excluded from bounds checking and swizzling, to go in
3884 // the instruction's soffset field).  This function takes the first kind of
3885 // offset and figures out how to split it between voffset and immoffset.
3886 std::pair<Register, unsigned>
3887 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3888                                         Register OrigOffset) const {
3889   const unsigned MaxImm = 4095;
3890   Register BaseReg;
3891   unsigned ImmOffset;
3892   const LLT S32 = LLT::scalar(32);
3893   MachineRegisterInfo &MRI = *B.getMRI();
3894 
3895   std::tie(BaseReg, ImmOffset) =
3896       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3897 
3898   // If BaseReg is a pointer, convert it to int.
3899   if (MRI.getType(BaseReg).isPointer())
3900     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3901 
3902   // If the immediate value is too big for the immoffset field, keep only the
3903   // low 12 bits (value & 4095) in the immoffset field, so that the value
3904   // copied/added for the voffset field is a multiple of 4096 and stands more
3905   // chance of being CSEd with the copy/add for another similar load/store.
3906   // However, do not do that rounding down to a multiple of 4096 if that is a
3907   // negative number, as it appears to be illegal to have a negative offset
3908   // in the vgpr, even if adding the immediate offset makes it positive.
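       //
       // e.g. a constant offset of 0x11A8 splits into voffset = 0x1000 and
       // immoffset = 0x1A8, while an offset whose rounded-down part would be
       // negative keeps everything in voffset and uses immoffset = 0.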
3909   unsigned Overflow = ImmOffset & ~MaxImm;
3910   ImmOffset -= Overflow;
3911   if ((int32_t)Overflow < 0) {
3912     Overflow += ImmOffset;
3913     ImmOffset = 0;
3914   }
3915 
3916   if (Overflow != 0) {
3917     if (!BaseReg) {
3918       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3919     } else {
3920       auto OverflowVal = B.buildConstant(S32, Overflow);
3921       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3922     }
3923   }
3924 
3925   if (!BaseReg)
3926     BaseReg = B.buildConstant(S32, 0).getReg(0);
3927 
3928   return std::make_pair(BaseReg, ImmOffset);
3929 }
3930 
3931 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3932 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3933                                           Register VOffset, Register SOffset,
3934                                           unsigned ImmOffset, Register VIndex,
3935                                           MachineRegisterInfo &MRI) const {
3936   Optional<ValueAndVReg> MaybeVOffsetVal =
3937       getIConstantVRegValWithLookThrough(VOffset, MRI);
3938   Optional<ValueAndVReg> MaybeSOffsetVal =
3939       getIConstantVRegValWithLookThrough(SOffset, MRI);
3940   Optional<ValueAndVReg> MaybeVIndexVal =
3941       getIConstantVRegValWithLookThrough(VIndex, MRI);
3942   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3943   // update the MMO with that offset. The stride is unknown so we can only do
3944   // this if VIndex is constant 0.
3945   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3946       MaybeVIndexVal->Value == 0) {
3947     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3948                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3949     MMO->setOffset(TotalOffset);
3950   } else {
3951     // We don't have a constant combined offset to use in the MMO. Give up.
3952     MMO->setValue((Value *)nullptr);
3953   }
3954 }
3955 
3956 /// Handle register layout difference for f16 images for some subtargets.
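     /// e.g. on unpacked-D16 subtargets a <4 x s16> store value is widened to a
     /// <4 x s32> whose elements are the anyext'ed halves, while on packed
     /// subtargets a <3 x s16> value is just padded with undef to <4 x s16>.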
3957 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3958                                              MachineRegisterInfo &MRI,
3959                                              Register Reg,
3960                                              bool ImageStore) const {
3961   const LLT S16 = LLT::scalar(16);
3962   const LLT S32 = LLT::scalar(32);
3963   LLT StoreVT = MRI.getType(Reg);
3964   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3965 
3966   if (ST.hasUnpackedD16VMem()) {
3967     auto Unmerge = B.buildUnmerge(S16, Reg);
3968 
3969     SmallVector<Register, 4> WideRegs;
3970     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3971       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3972 
3973     int NumElts = StoreVT.getNumElements();
3974 
3975     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3976         .getReg(0);
3977   }
3978 
3979   if (ImageStore && ST.hasImageStoreD16Bug()) {
3980     if (StoreVT.getNumElements() == 2) {
3981       SmallVector<Register, 4> PackedRegs;
3982       Reg = B.buildBitcast(S32, Reg).getReg(0);
3983       PackedRegs.push_back(Reg);
3984       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3985       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3986           .getReg(0);
3987     }
3988 
3989     if (StoreVT.getNumElements() == 3) {
3990       SmallVector<Register, 4> PackedRegs;
3991       auto Unmerge = B.buildUnmerge(S16, Reg);
3992       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3993         PackedRegs.push_back(Unmerge.getReg(I));
3994       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3995       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3996       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3997     }
3998 
3999     if (StoreVT.getNumElements() == 4) {
4000       SmallVector<Register, 4> PackedRegs;
4001       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4002       auto Unmerge = B.buildUnmerge(S32, Reg);
4003       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4004         PackedRegs.push_back(Unmerge.getReg(I));
4005       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4006       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4007           .getReg(0);
4008     }
4009 
4010     llvm_unreachable("invalid data type");
4011   }
4012 
4013   if (StoreVT == LLT::fixed_vector(3, S16)) {
4014     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4015               .getReg(0);
4016   }
4017   return Reg;
4018 }
4019 
4020 Register AMDGPULegalizerInfo::fixStoreSourceType(
4021   MachineIRBuilder &B, Register VData, bool IsFormat) const {
4022   MachineRegisterInfo *MRI = B.getMRI();
4023   LLT Ty = MRI->getType(VData);
4024 
4025   const LLT S16 = LLT::scalar(16);
4026 
4027   // Fixup illegal register types for i8 stores.
4028   if (Ty == LLT::scalar(8) || Ty == S16) {
4029     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4030     return AnyExt;
4031   }
4032 
4033   if (Ty.isVector()) {
4034     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4035       if (IsFormat)
4036         return handleD16VData(B, *MRI, VData);
4037     }
4038   }
4039 
4040   return VData;
4041 }
4042 
4043 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4044                                               MachineRegisterInfo &MRI,
4045                                               MachineIRBuilder &B,
4046                                               bool IsTyped,
4047                                               bool IsFormat) const {
4048   Register VData = MI.getOperand(1).getReg();
4049   LLT Ty = MRI.getType(VData);
4050   LLT EltTy = Ty.getScalarType();
4051   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4052   const LLT S32 = LLT::scalar(32);
4053 
4054   VData = fixStoreSourceType(B, VData, IsFormat);
4055   Register RSrc = MI.getOperand(2).getReg();
4056 
4057   MachineMemOperand *MMO = *MI.memoperands_begin();
4058   const int MemSize = MMO->getSize();
4059 
4060   unsigned ImmOffset;
4061 
4062   // The typed intrinsics add an immediate after the registers.
4063   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
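       // i.e., counting the leading intrinsic ID operand, a raw store looks
       // like (id, vdata, rsrc, voffset, soffset, [format,] aux) and the struct
       // form inserts vindex between rsrc and voffset.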
4064 
4065   // The struct intrinsic variants add one additional operand over raw.
4066   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4067   Register VIndex;
4068   int OpOffset = 0;
4069   if (HasVIndex) {
4070     VIndex = MI.getOperand(3).getReg();
4071     OpOffset = 1;
4072   } else {
4073     VIndex = B.buildConstant(S32, 0).getReg(0);
4074   }
4075 
4076   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4077   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4078 
4079   unsigned Format = 0;
4080   if (IsTyped) {
4081     Format = MI.getOperand(5 + OpOffset).getImm();
4082     ++OpOffset;
4083   }
4084 
4085   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4086 
4087   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4088   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4089 
4090   unsigned Opc;
4091   if (IsTyped) {
4092     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4093                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4094   } else if (IsFormat) {
4095     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4096                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4097   } else {
4098     switch (MemSize) {
4099     case 1:
4100       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4101       break;
4102     case 2:
4103       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4104       break;
4105     default:
4106       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4107       break;
4108     }
4109   }
4110 
4111   auto MIB = B.buildInstr(Opc)
4112     .addUse(VData)              // vdata
4113     .addUse(RSrc)               // rsrc
4114     .addUse(VIndex)             // vindex
4115     .addUse(VOffset)            // voffset
4116     .addUse(SOffset)            // soffset
4117     .addImm(ImmOffset);         // offset(imm)
4118 
4119   if (IsTyped)
4120     MIB.addImm(Format);
4121 
4122   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4123      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4124      .addMemOperand(MMO);
4125 
4126   MI.eraseFromParent();
4127   return true;
4128 }
4129 
4130 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4131                                              MachineRegisterInfo &MRI,
4132                                              MachineIRBuilder &B,
4133                                              bool IsFormat,
4134                                              bool IsTyped) const {
4135   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4136   MachineMemOperand *MMO = *MI.memoperands_begin();
4137   const LLT MemTy = MMO->getMemoryType();
4138   const LLT S32 = LLT::scalar(32);
4139 
4140   Register Dst = MI.getOperand(0).getReg();
4141   Register RSrc = MI.getOperand(2).getReg();
4142 
4143   // The typed intrinsics add an immediate after the registers.
4144   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4145 
4146   // The struct intrinsic variants add one additional operand over raw.
4147   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4148   Register VIndex;
4149   int OpOffset = 0;
4150   if (HasVIndex) {
4151     VIndex = MI.getOperand(3).getReg();
4152     OpOffset = 1;
4153   } else {
4154     VIndex = B.buildConstant(S32, 0).getReg(0);
4155   }
4156 
4157   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4158   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4159 
4160   unsigned Format = 0;
4161   if (IsTyped) {
4162     Format = MI.getOperand(5 + OpOffset).getImm();
4163     ++OpOffset;
4164   }
4165 
4166   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4167   unsigned ImmOffset;
4168 
4169   LLT Ty = MRI.getType(Dst);
4170   LLT EltTy = Ty.getScalarType();
4171   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4172   const bool Unpacked = ST.hasUnpackedD16VMem();
4173 
4174   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4175   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4176 
4177   unsigned Opc;
4178 
4179   if (IsTyped) {
4180     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4181                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4182   } else if (IsFormat) {
4183     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4184                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4185   } else {
4186     switch (MemTy.getSizeInBits()) {
4187     case 8:
4188       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4189       break;
4190     case 16:
4191       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4192       break;
4193     default:
4194       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4195       break;
4196     }
4197   }
4198 
4199   Register LoadDstReg;
4200 
4201   bool IsExtLoad =
4202       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4203   LLT UnpackedTy = Ty.changeElementSize(32);
4204 
4205   if (IsExtLoad)
4206     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4207   else if (Unpacked && IsD16 && Ty.isVector())
4208     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4209   else
4210     LoadDstReg = Dst;
4211 
4212   auto MIB = B.buildInstr(Opc)
4213     .addDef(LoadDstReg)         // vdata
4214     .addUse(RSrc)               // rsrc
4215     .addUse(VIndex)             // vindex
4216     .addUse(VOffset)            // voffset
4217     .addUse(SOffset)            // soffset
4218     .addImm(ImmOffset);         // offset(imm)
4219 
4220   if (IsTyped)
4221     MIB.addImm(Format);
4222 
4223   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4224      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4225      .addMemOperand(MMO);
4226 
4227   if (LoadDstReg != Dst) {
4228     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4229 
4230     // The result was widened for extending loads; truncate it back down.
4231     if (IsExtLoad)
4232       B.buildTrunc(Dst, LoadDstReg);
4233     else {
4234       // Repack to original 16-bit vector result
4235       // FIXME: G_TRUNC should work, but legalization currently fails
4236       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4237       SmallVector<Register, 4> Repack;
4238       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4239         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4240       B.buildMerge(Dst, Repack);
4241     }
4242   }
4243 
4244   MI.eraseFromParent();
4245   return true;
4246 }
4247 
4248 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4249                                                MachineIRBuilder &B,
4250                                                bool IsInc) const {
4251   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4252                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4253   B.buildInstr(Opc)
4254     .addDef(MI.getOperand(0).getReg())
4255     .addUse(MI.getOperand(2).getReg())
4256     .addUse(MI.getOperand(3).getReg())
4257     .cloneMemRefs(MI);
4258   MI.eraseFromParent();
4259   return true;
4260 }
4261 
4262 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4263   switch (IntrID) {
4264   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4265   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4266     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4267   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4268   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4269     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4270   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4271   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4272     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4273   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4274   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4275     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4276   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4277   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4278     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4279   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4280   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4281     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4282   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4283   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4284     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4285   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4286   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4287     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4288   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4289   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4290     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4291   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4292   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4293     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4294   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4295   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4296     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4297   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4298   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4299     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4300   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4301   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4302     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4303   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4304   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4305     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4306   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4307   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4308     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4309   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4310   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4311     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4312   default:
4313     llvm_unreachable("unhandled atomic opcode");
4314   }
4315 }
4316 
4317 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4318                                                MachineIRBuilder &B,
4319                                                Intrinsic::ID IID) const {
4320   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4321                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4322   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4323 
4324   Register Dst;
4325 
4326   int OpOffset = 0;
4327   if (HasReturn) {
4328     // A few FP atomics do not support return values.
4329     Dst = MI.getOperand(0).getReg();
4330   } else {
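    // No return value; every operand index after the intrinsic ID shifts down
    // by one.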
4331     OpOffset = -1;
4332   }
4333 
4334   Register VData = MI.getOperand(2 + OpOffset).getReg();
4335   Register CmpVal;
4336 
4337   if (IsCmpSwap) {
4338     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4339     ++OpOffset;
4340   }
4341 
4342   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4343   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4344 
4345   // The struct intrinsic variants add one additional operand over raw.
4346   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4347   Register VIndex;
4348   if (HasVIndex) {
4349     VIndex = MI.getOperand(4 + OpOffset).getReg();
4350     ++OpOffset;
4351   } else {
4352     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4353   }
4354 
4355   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4356   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4357   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4358 
4359   MachineMemOperand *MMO = *MI.memoperands_begin();
4360 
4361   unsigned ImmOffset;
4362   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4363   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4364 
4365   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4366 
4367   if (HasReturn)
4368     MIB.addDef(Dst);
4369 
4370   MIB.addUse(VData); // vdata
4371 
4372   if (IsCmpSwap)
4373     MIB.addReg(CmpVal);
4374 
4375   MIB.addUse(RSrc)               // rsrc
4376      .addUse(VIndex)             // vindex
4377      .addUse(VOffset)            // voffset
4378      .addUse(SOffset)            // soffset
4379      .addImm(ImmOffset)          // offset(imm)
4380      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4381      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4382      .addMemOperand(MMO);
4383 
4384   MI.eraseFromParent();
4385   return true;
4386 }
4387 
4388 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4389 /// vector with s16 typed elements.
4390 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4391                                       SmallVectorImpl<Register> &PackedAddrs,
4392                                       unsigned ArgOffset,
4393                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4394                                       bool IsA16, bool IsG16) {
4395   const LLT S16 = LLT::scalar(16);
4396   const LLT V2S16 = LLT::fixed_vector(2, 16);
4397   auto EndIdx = Intr->VAddrEnd;
4398 
4399   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4400     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4401     if (!SrcOp.isReg())
4402       continue; // _L to _LZ may have eliminated this.
4403 
4404     Register AddrReg = SrcOp.getReg();
4405 
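    // Dword-sized operands: anything before the gradients, gradients when G16
    // is off, and coordinates when A16 is off. These are bitcast to v2s16 so
    // all packed address elements share one type; the 16-bit bias with A16 is
    // the one exception, handled first.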
4406     if ((I < Intr->GradientStart) ||
4407         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4408         (I >= Intr->CoordStart && !IsA16)) {
4409       if ((I < Intr->GradientStart) && IsA16 &&
4410           (B.getMRI()->getType(AddrReg) == S16)) {
4411         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4412         // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit slot.
4414         PackedAddrs.push_back(
4415             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4416                 .getReg(0));
4417       } else {
4418         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4419                "Bias needs to be converted to 16 bit in A16 mode");
4420         // Handle any gradient or coordinate operands that should not be packed
4421         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4422         PackedAddrs.push_back(AddrReg);
4423       }
4424     } else {
4425       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4426       // derivatives dx/dh and dx/dv are packed with undef.
4427       if (((I + 1) >= EndIdx) ||
4428           ((Intr->NumGradients / 2) % 2 == 1 &&
4429            (I == static_cast<unsigned>(Intr->GradientStart +
4430                                        (Intr->NumGradients / 2) - 1) ||
4431             I == static_cast<unsigned>(Intr->GradientStart +
4432                                        Intr->NumGradients - 1))) ||
4433           // Check for _L to _LZ optimization
4434           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4435         PackedAddrs.push_back(
4436             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4437                 .getReg(0));
4438       } else {
4439         PackedAddrs.push_back(
4440             B.buildBuildVector(
4441                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4442                 .getReg(0));
4443         ++I;
4444       }
4445     }
4446   }
4447 }
4448 
4449 /// Convert from separate vaddr components to a single vector address register,
4450 /// and replace the remaining operands with $noreg.
4451 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4452                                      int DimIdx, int NumVAddrs) {
4453   const LLT S32 = LLT::scalar(32);
4454 
4455   SmallVector<Register, 8> AddrRegs;
4456   for (int I = 0; I != NumVAddrs; ++I) {
4457     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4458     if (SrcOp.isReg()) {
4459       AddrRegs.push_back(SrcOp.getReg());
4460       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4461     }
4462   }
4463 
4464   int NumAddrRegs = AddrRegs.size();
4465   if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
4467     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4468       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4469       auto Undef = B.buildUndef(S32);
4470       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4471       NumAddrRegs = RoundedNumRegs;
4472     }
4473 
4474     auto VAddr =
4475         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4476     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4477   }
4478 
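  // The packed vector (or the single remaining register) now carries the
  // address, so clear the rest of the vaddr operands.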
4479   for (int I = 1; I != NumVAddrs; ++I) {
4480     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4481     if (SrcOp.isReg())
4482       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4483   }
4484 }
4485 
4486 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4487 ///
4488 /// Depending on the subtarget, load/store with 16-bit element data need to be
4489 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4490 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4491 /// registers.
4492 ///
4493 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
4495 /// want a selected instruction entering RegBankSelect. In order to avoid
4496 /// defining a multitude of intermediate image instructions, directly hack on
4497 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4498 /// padding now unnecessary arguments with $noreg.
4499 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4500     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4501     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4502 
4503   const unsigned NumDefs = MI.getNumExplicitDefs();
4504   const unsigned ArgOffset = NumDefs + 1;
4505   bool IsTFE = NumDefs == 2;
4506   // We are only processing the operands of d16 image operations on subtargets
4507   // that use the unpacked register layout, or need to repack the TFE result.
4508 
4509   // TODO: Do we need to guard against already legalized intrinsics?
4510   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4511       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4512 
4513   MachineRegisterInfo *MRI = B.getMRI();
4514   const LLT S32 = LLT::scalar(32);
4515   const LLT S16 = LLT::scalar(16);
4516   const LLT V2S16 = LLT::fixed_vector(2, 16);
4517 
4518   unsigned DMask = 0;
4519   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4520   LLT Ty = MRI->getType(VData);
4521 
  // Check for 16-bit addresses and gradients, which are packed below.
4523   LLT GradTy =
4524       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4525   LLT AddrTy =
4526       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4527   const bool IsG16 = GradTy == S16;
4528   const bool IsA16 = AddrTy == S16;
4529   const bool IsD16 = Ty.getScalarType() == S16;
4530 
4531   int DMaskLanes = 0;
4532   if (!BaseOpcode->Atomic) {
4533     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4534     if (BaseOpcode->Gather4) {
4535       DMaskLanes = 4;
4536     } else if (DMask != 0) {
4537       DMaskLanes = countPopulation(DMask);
4538     } else if (!IsTFE && !BaseOpcode->Store) {
4539       // If dmask is 0, this is a no-op load. This can be eliminated.
4540       B.buildUndef(MI.getOperand(0));
4541       MI.eraseFromParent();
4542       return true;
4543     }
4544   }
4545 
4546   Observer.changingInstr(MI);
4547   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4548 
4549   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4550                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4551   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4552                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4553   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4554 
4555   // Track that we legalized this
4556   MI.setDesc(B.getTII().get(NewOpcode));
4557 
  // Expecting to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4560   if (IsTFE && DMask == 0) {
4561     DMask = 0x1;
4562     DMaskLanes = 1;
4563     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4564   }
4565 
4566   if (BaseOpcode->Atomic) {
4567     Register VData0 = MI.getOperand(2).getReg();
4568     LLT Ty = MRI->getType(VData0);
4569 
4570     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4571     if (Ty.isVector())
4572       return false;
4573 
4574     if (BaseOpcode->AtomicX2) {
4575       Register VData1 = MI.getOperand(3).getReg();
4576       // The two values are packed in one register.
4577       LLT PackedTy = LLT::fixed_vector(2, Ty);
4578       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4579       MI.getOperand(2).setReg(Concat.getReg(0));
4580       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4581     }
4582   }
4583 
4584   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4585 
4586   // Rewrite the addressing register layout before doing anything else.
4587   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both gradients and addresses must be 16-bit.
4590     return false;
4591   }
4592 
4593   if (IsA16 && !ST.hasA16()) {
4594     // A16 not supported
4595     return false;
4596   }
4597 
4598   if (IsA16 || IsG16) {
4599     if (Intr->NumVAddrs > 1) {
4600       SmallVector<Register, 4> PackedRegs;
4601 
4602       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4603                                 IsG16);
4604 
4605       // See also below in the non-a16 branch
4606       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4607                           PackedRegs.size() <= ST.getNSAMaxSize();
4608 
4609       if (!UseNSA && PackedRegs.size() > 1) {
4610         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4611         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4612         PackedRegs[0] = Concat.getReg(0);
4613         PackedRegs.resize(1);
4614       }
4615 
4616       const unsigned NumPacked = PackedRegs.size();
4617       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4618         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4619         if (!SrcOp.isReg()) {
4620           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4621           continue;
4622         }
4623 
4624         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4625 
4626         if (I - Intr->VAddrStart < NumPacked)
4627           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4628         else
4629           SrcOp.setReg(AMDGPU::NoRegister);
4630       }
4631     }
4632   } else {
4633     // If the register allocator cannot place the address registers contiguously
4634     // without introducing moves, then using the non-sequential address encoding
4635     // is always preferable, since it saves VALU instructions and is usually a
4636     // wash in terms of code size or even better.
4637     //
4638     // However, we currently have no way of hinting to the register allocator
4639     // that MIMG addresses should be placed contiguously when it is possible to
4640     // do so, so force non-NSA for the common 2-address case as a heuristic.
4641     //
4642     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4643     // allocation when possible.
4644     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4645                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4646 
4647     if (!UseNSA && Intr->NumVAddrs > 1)
4648       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4649                                Intr->NumVAddrs);
4650   }
4651 
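  // Record which of A16/G16 were applied as an immediate flag operand
  // (bit 0 = a16, bit 1 = g16).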
4652   int Flags = 0;
4653   if (IsA16)
4654     Flags |= 1;
4655   if (IsG16)
4656     Flags |= 2;
4657   MI.addOperand(MachineOperand::CreateImm(Flags));
4658 
4659   if (BaseOpcode->Store) { // No TFE for stores?
4660     // TODO: Handle dmask trim
4661     if (!Ty.isVector() || !IsD16)
4662       return true;
4663 
4664     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4665     if (RepackedReg != VData) {
4666       MI.getOperand(1).setReg(RepackedReg);
4667     }
4668 
4669     return true;
4670   }
4671 
4672   Register DstReg = MI.getOperand(0).getReg();
4673   const LLT EltTy = Ty.getScalarType();
4674   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4675 
4676   // Confirm that the return type is large enough for the dmask specified
4677   if (NumElts < DMaskLanes)
4678     return false;
4679 
4680   if (NumElts > 4 || DMaskLanes > 4)
4681     return false;
4682 
4683   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4684   const LLT AdjustedTy =
4685       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4686 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4690   LLT RoundedTy;
4691 
  // S32 vector to cover all data, plus the TFE result element.
4693   LLT TFETy;
4694 
4695   // Register type to use for each loaded component. Will be S32 or V2S16.
4696   LLT RegTy;
4697 
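  // Unpacked d16 subtargets hold one 16-bit component per 32-bit register;
  // other subtargets keep the packed layout.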
4698   if (IsD16 && ST.hasUnpackedD16VMem()) {
4699     RoundedTy =
4700         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4701     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4702     RegTy = S32;
4703   } else {
4704     unsigned EltSize = EltTy.getSizeInBits();
4705     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4706     unsigned RoundedSize = 32 * RoundedElts;
4707     RoundedTy = LLT::scalarOrVector(
4708         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4709     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4710     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4711   }
4712 
4713   // The return type does not need adjustment.
4714   // TODO: Should we change s16 case to s32 or <2 x s16>?
4715   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4716     return true;
4717 
4718   Register Dst1Reg;
4719 
4720   // Insert after the instruction.
4721   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4722 
4723   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4724   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4725   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4726   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4727 
4728   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4729 
4730   MI.getOperand(0).setReg(NewResultReg);
4731 
4732   // In the IR, TFE is supposed to be used with a 2 element struct return
4733   // type. The instruction really returns these two values in one contiguous
4734   // register, with one additional dword beyond the loaded data. Rewrite the
4735   // return type to use a single register result.
4736 
4737   if (IsTFE) {
4738     Dst1Reg = MI.getOperand(1).getReg();
4739     if (MRI->getType(Dst1Reg) != S32)
4740       return false;
4741 
4742     // TODO: Make sure the TFE operand bit is set.
4743     MI.removeOperand(1);
4744 
4745     // Handle the easy case that requires no repack instructions.
4746     if (Ty == S32) {
4747       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4748       return true;
4749     }
4750   }
4751 
4752   // Now figure out how to copy the new result register back into the old
4753   // result.
4754   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4755 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4757 
4758   if (ResultNumRegs == 1) {
4759     assert(!IsTFE);
4760     ResultRegs[0] = NewResultReg;
4761   } else {
4762     // We have to repack into a new vector of some kind.
4763     for (int I = 0; I != NumDataRegs; ++I)
4764       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4765     B.buildUnmerge(ResultRegs, NewResultReg);
4766 
4767     // Drop the final TFE element to get the data part. The TFE result is
4768     // directly written to the right place already.
4769     if (IsTFE)
4770       ResultRegs.resize(NumDataRegs);
4771   }
4772 
4773   // For an s16 scalar result, we form an s32 result with a truncate regardless
4774   // of packed vs. unpacked.
4775   if (IsD16 && !Ty.isVector()) {
4776     B.buildTrunc(DstReg, ResultRegs[0]);
4777     return true;
4778   }
4779 
4780   // Avoid a build/concat_vector of 1 entry.
4781   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4782     B.buildBitcast(DstReg, ResultRegs[0]);
4783     return true;
4784   }
4785 
4786   assert(Ty.isVector());
4787 
4788   if (IsD16) {
4789     // For packed D16 results with TFE enabled, all the data components are
4790     // S32. Cast back to the expected type.
4791     //
    // TODO: We don't really need to load s32 elements. We would only need one
4793     // cast for the TFE result if a multiple of v2s16 was used.
4794     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4795       for (Register &Reg : ResultRegs)
4796         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4797     } else if (ST.hasUnpackedD16VMem()) {
4798       for (Register &Reg : ResultRegs)
4799         Reg = B.buildTrunc(S16, Reg).getReg(0);
4800     }
4801   }
4802 
4803   auto padWithUndef = [&](LLT Ty, int NumElts) {
4804     if (NumElts == 0)
4805       return;
4806     Register Undef = B.buildUndef(Ty).getReg(0);
4807     for (int I = 0; I != NumElts; ++I)
4808       ResultRegs.push_back(Undef);
4809   };
4810 
4811   // Pad out any elements eliminated due to the dmask.
4812   LLT ResTy = MRI->getType(ResultRegs[0]);
4813   if (!ResTy.isVector()) {
4814     padWithUndef(ResTy, NumElts - ResultRegs.size());
4815     B.buildBuildVector(DstReg, ResultRegs);
4816     return true;
4817   }
4818 
4819   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4820   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4821 
4822   // Deal with the one annoying legal case.
4823   const LLT V3S16 = LLT::fixed_vector(3, 16);
4824   if (Ty == V3S16) {
4825     if (IsTFE) {
4826       if (ResultRegs.size() == 1) {
4827         NewResultReg = ResultRegs[0];
4828       } else if (ResultRegs.size() == 2) {
4829         LLT V4S16 = LLT::fixed_vector(4, 16);
4830         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4831       } else {
4832         return false;
4833       }
4834     }
4835 
4836     if (MRI->getType(DstReg).getNumElements() <
4837         MRI->getType(NewResultReg).getNumElements()) {
4838       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4839     } else {
4840       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4841     }
4842     return true;
4843   }
4844 
4845   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4846   B.buildConcatVectors(DstReg, ResultRegs);
4847   return true;
4848 }
4849 
4850 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4851   LegalizerHelper &Helper, MachineInstr &MI) const {
4852   MachineIRBuilder &B = Helper.MIRBuilder;
4853   GISelChangeObserver &Observer = Helper.Observer;
4854 
4855   Register Dst = MI.getOperand(0).getReg();
4856   LLT Ty = B.getMRI()->getType(Dst);
4857   unsigned Size = Ty.getSizeInBits();
4858   MachineFunction &MF = B.getMF();
4859 
4860   Observer.changingInstr(MI);
4861 
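  // Some result types are easier to handle as their dword-based equivalent;
  // bitcast the destination to that type before any widening below.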
4862   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4863     Ty = getBitcastRegisterType(Ty);
4864     Helper.bitcastDst(MI, Ty, 0);
4865     Dst = MI.getOperand(0).getReg();
4866     B.setInsertPt(B.getMBB(), MI);
4867   }
4868 
4869   // FIXME: We don't really need this intermediate instruction. The intrinsic
4870   // should be fixed to have a memory operand. Since it's readnone, we're not
4871   // allowed to add one.
4872   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4873   MI.removeOperand(1); // Remove intrinsic ID
4874 
4875   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4876   // TODO: Should this use datalayout alignment?
4877   const unsigned MemSize = (Size + 7) / 8;
4878   const Align MemAlign(4);
4879   MachineMemOperand *MMO = MF.getMachineMemOperand(
4880       MachinePointerInfo(),
4881       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4882           MachineMemOperand::MOInvariant,
4883       MemSize, MemAlign);
4884   MI.addMemOperand(MF, MMO);
4885 
4886   // There are no 96-bit result scalar loads, but widening to 128-bit should
4887   // always be legal. We may need to restore this to a 96-bit result if it turns
4888   // out this needs to be converted to a vector load during RegBankSelect.
4889   if (!isPowerOf2_32(Size)) {
4890     if (Ty.isVector())
4891       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4892     else
4893       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4894   }
4895 
4896   Observer.changedInstr(MI);
4897   return true;
4898 }
4899 
4900 // TODO: Move to selection
4901 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4902                                                 MachineRegisterInfo &MRI,
4903                                                 MachineIRBuilder &B) const {
4904   if (!ST.isTrapHandlerEnabled() ||
4905       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4906     return legalizeTrapEndpgm(MI, MRI, B);
4907 
4908   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4909     switch (*HsaAbiVer) {
4910     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4911     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4912       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4913     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4914     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
4915       return ST.supportsGetDoorbellID() ?
4916           legalizeTrapHsa(MI, MRI, B) :
4917           legalizeTrapHsaQueuePtr(MI, MRI, B);
4918     }
4919   }
4920 
4921   llvm_unreachable("Unknown trap handler");
4922 }
4923 
4924 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4925     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4926   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4927   MI.eraseFromParent();
4928   return true;
4929 }
4930 
4931 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4932     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4933   MachineFunction &MF = B.getMF();
4934   const LLT S64 = LLT::scalar(64);
4935 
4936   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4937   // For code object version 5, queue_ptr is passed through implicit kernarg.
4938   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
4939     AMDGPUTargetLowering::ImplicitParameter Param =
4940         AMDGPUTargetLowering::QUEUE_PTR;
4941     uint64_t Offset =
4942         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
4943 
4944     Register KernargPtrReg = MRI.createGenericVirtualRegister(
4945         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4946 
4947     if (!loadInputValue(KernargPtrReg, B,
4948                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4949       return false;
4950 
4951     // TODO: can we be smarter about machine pointer info?
4952     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4953     MachineMemOperand *MMO = MF.getMachineMemOperand(
4954         PtrInfo,
4955         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4956             MachineMemOperand::MOInvariant,
4957         LLT::scalar(64), commonAlignment(Align(64), Offset));
4958 
4959     // Pointer address
4960     Register LoadAddr = MRI.createGenericVirtualRegister(
4961         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4962     B.buildPtrAdd(LoadAddr, KernargPtrReg,
4963                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
4964     // Load address
4965     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
4966     B.buildCopy(SGPR01, Temp);
4967     B.buildInstr(AMDGPU::S_TRAP)
4968         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4969         .addReg(SGPR01, RegState::Implicit);
4970     MI.eraseFromParent();
4971     return true;
4972   }
4973 
4974   // Pass queue pointer to trap handler as input, and insert trap instruction
4975   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4976   Register LiveIn =
4977     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4978   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4979     return false;
4980 
4981   B.buildCopy(SGPR01, LiveIn);
4982   B.buildInstr(AMDGPU::S_TRAP)
4983       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4984       .addReg(SGPR01, RegState::Implicit);
4985 
4986   MI.eraseFromParent();
4987   return true;
4988 }
4989 
4990 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4991     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4992   B.buildInstr(AMDGPU::S_TRAP)
4993       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4994   MI.eraseFromParent();
4995   return true;
4996 }
4997 
4998 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4999     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA target or the trap handler is disabled, report a
  // warning instead of inserting a trap.
5002   if (!ST.isTrapHandlerEnabled() ||
5003       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5004     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5005                                      "debugtrap handler not supported",
5006                                      MI.getDebugLoc(), DS_Warning);
5007     LLVMContext &Ctx = B.getMF().getFunction().getContext();
5008     Ctx.diagnose(NoTrap);
5009   } else {
5010     // Insert debug-trap instruction
5011     B.buildInstr(AMDGPU::S_TRAP)
5012         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5013   }
5014 
5015   MI.eraseFromParent();
5016   return true;
5017 }
5018 
5019 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5020                                                MachineIRBuilder &B) const {
5021   MachineRegisterInfo &MRI = *B.getMRI();
5022   const LLT S16 = LLT::scalar(16);
5023   const LLT S32 = LLT::scalar(32);
5024 
5025   Register DstReg = MI.getOperand(0).getReg();
5026   Register NodePtr = MI.getOperand(2).getReg();
5027   Register RayExtent = MI.getOperand(3).getReg();
5028   Register RayOrigin = MI.getOperand(4).getReg();
5029   Register RayDir = MI.getOperand(5).getReg();
5030   Register RayInvDir = MI.getOperand(6).getReg();
5031   Register TDescr = MI.getOperand(7).getReg();
5032 
5033   if (!ST.hasGFX10_AEncoding()) {
5034     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5035                                         "intrinsic not supported on subtarget",
5036                                         MI.getDebugLoc());
5037     B.getMF().getFunction().getContext().diagnose(BadIntrin);
5038     return false;
5039   }
5040 
5041   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5042   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5043   const unsigned NumVDataDwords = 4;
5044   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5045   const bool UseNSA =
5046       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
5047   const unsigned BaseOpcodes[2][2] = {
5048       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5049       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5050        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5051   int Opcode;
5052   if (UseNSA) {
5053     Opcode =
5054         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
5055                               NumVDataDwords, NumVAddrDwords);
5056   } else {
5057     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5058                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
5059                                    PowerOf2Ceil(NumVAddrDwords));
5060   }
5061   assert(Opcode != -1);
5062 
5063   SmallVector<Register, 12> Ops;
5064   if (Is64) {
5065     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5066     Ops.push_back(Unmerge.getReg(0));
5067     Ops.push_back(Unmerge.getReg(1));
5068   } else {
5069     Ops.push_back(NodePtr);
5070   }
5071   Ops.push_back(RayExtent);
5072 
5073   auto packLanes = [&Ops, &S32, &B](Register Src) {
5074     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5075     Ops.push_back(Unmerge.getReg(0));
5076     Ops.push_back(Unmerge.getReg(1));
5077     Ops.push_back(Unmerge.getReg(2));
5078   };
5079 
5080   packLanes(RayOrigin);
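  // With 16-bit ray directions, the six direction/inverse-direction components
  // are repacked into three dwords; otherwise each vector contributes three
  // full dwords.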
5081   if (IsA16) {
5082     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5083     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5084     Register R1 = MRI.createGenericVirtualRegister(S32);
5085     Register R2 = MRI.createGenericVirtualRegister(S32);
5086     Register R3 = MRI.createGenericVirtualRegister(S32);
5087     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5088     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5089     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5090     Ops.push_back(R1);
5091     Ops.push_back(R2);
5092     Ops.push_back(R3);
5093   } else {
5094     packLanes(RayDir);
5095     packLanes(RayInvDir);
5096   }
5097 
5098   if (!UseNSA) {
5099     // Build a single vector containing all the operands so far prepared.
5100     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5101     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5102     Ops.clear();
5103     Ops.push_back(MergedOps);
5104   }
5105 
5106   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5107     .addDef(DstReg)
5108     .addImm(Opcode);
5109 
5110   for (Register R : Ops) {
5111     MIB.addUse(R);
5112   }
5113 
5114   MIB.addUse(TDescr)
5115      .addImm(IsA16 ? 1 : 0)
5116      .cloneMemRefs(MI);
5117 
5118   MI.eraseFromParent();
5119   return true;
5120 }
5121 
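// Select the round-upward or round-downward fptrunc pseudo based on the
// rounding-mode operand; other rounding modes are rejected.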
5122 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5123                                                MachineIRBuilder &B) const {
5124   unsigned Opc;
5125   int RoundMode = MI.getOperand(2).getImm();
5126 
5127   if (RoundMode == (int)RoundingMode::TowardPositive)
5128     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5129   else if (RoundMode == (int)RoundingMode::TowardNegative)
5130     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5131   else
5132     return false;
5133 
5134   B.buildInstr(Opc)
5135       .addDef(MI.getOperand(0).getReg())
5136       .addUse(MI.getOperand(1).getReg());
5137 
5138   MI.eraseFromParent();
5139 
5140   return true;
5141 }
5142 
5143 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5144                                             MachineInstr &MI) const {
5145   MachineIRBuilder &B = Helper.MIRBuilder;
5146   MachineRegisterInfo &MRI = *B.getMRI();
5147 
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
5149   auto IntrID = MI.getIntrinsicID();
5150   switch (IntrID) {
5151   case Intrinsic::amdgcn_if:
5152   case Intrinsic::amdgcn_else: {
5153     MachineInstr *Br = nullptr;
5154     MachineBasicBlock *UncondBrTarget = nullptr;
5155     bool Negated = false;
5156     if (MachineInstr *BrCond =
5157             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5158       const SIRegisterInfo *TRI
5159         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5160 
5161       Register Def = MI.getOperand(1).getReg();
5162       Register Use = MI.getOperand(3).getReg();
5163 
5164       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5165 
5166       if (Negated)
5167         std::swap(CondBrTarget, UncondBrTarget);
5168 
5169       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5170       if (IntrID == Intrinsic::amdgcn_if) {
5171         B.buildInstr(AMDGPU::SI_IF)
5172           .addDef(Def)
5173           .addUse(Use)
5174           .addMBB(UncondBrTarget);
5175       } else {
5176         B.buildInstr(AMDGPU::SI_ELSE)
5177             .addDef(Def)
5178             .addUse(Use)
5179             .addMBB(UncondBrTarget);
5180       }
5181 
5182       if (Br) {
5183         Br->getOperand(0).setMBB(CondBrTarget);
5184       } else {
5185         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5186         // since we're swapping branch targets it needs to be reinserted.
5187         // FIXME: IRTranslator should probably not do this
5188         B.buildBr(*CondBrTarget);
5189       }
5190 
5191       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5192       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5193       MI.eraseFromParent();
5194       BrCond->eraseFromParent();
5195       return true;
5196     }
5197 
5198     return false;
5199   }
5200   case Intrinsic::amdgcn_loop: {
5201     MachineInstr *Br = nullptr;
5202     MachineBasicBlock *UncondBrTarget = nullptr;
5203     bool Negated = false;
5204     if (MachineInstr *BrCond =
5205             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5206       const SIRegisterInfo *TRI
5207         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5208 
5209       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5210       Register Reg = MI.getOperand(2).getReg();
5211 
5212       if (Negated)
5213         std::swap(CondBrTarget, UncondBrTarget);
5214 
5215       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5216       B.buildInstr(AMDGPU::SI_LOOP)
5217         .addUse(Reg)
5218         .addMBB(UncondBrTarget);
5219 
5220       if (Br)
5221         Br->getOperand(0).setMBB(CondBrTarget);
5222       else
5223         B.buildBr(*CondBrTarget);
5224 
5225       MI.eraseFromParent();
5226       BrCond->eraseFromParent();
5227       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5228       return true;
5229     }
5230 
5231     return false;
5232   }
5233   case Intrinsic::amdgcn_kernarg_segment_ptr:
5234     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5235       // This only makes sense to call in a kernel, so just lower to null.
5236       B.buildConstant(MI.getOperand(0).getReg(), 0);
5237       MI.eraseFromParent();
5238       return true;
5239     }
5240 
5241     return legalizePreloadedArgIntrin(
5242       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5243   case Intrinsic::amdgcn_implicitarg_ptr:
5244     return legalizeImplicitArgPtr(MI, MRI, B);
5245   case Intrinsic::amdgcn_workitem_id_x:
5246     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5247                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5248   case Intrinsic::amdgcn_workitem_id_y:
5249     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5250                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5251   case Intrinsic::amdgcn_workitem_id_z:
5252     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5253                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5254   case Intrinsic::amdgcn_workgroup_id_x:
5255     return legalizePreloadedArgIntrin(MI, MRI, B,
5256                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5257   case Intrinsic::amdgcn_workgroup_id_y:
5258     return legalizePreloadedArgIntrin(MI, MRI, B,
5259                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5260   case Intrinsic::amdgcn_workgroup_id_z:
5261     return legalizePreloadedArgIntrin(MI, MRI, B,
5262                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5263   case Intrinsic::amdgcn_dispatch_ptr:
5264     return legalizePreloadedArgIntrin(MI, MRI, B,
5265                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5266   case Intrinsic::amdgcn_queue_ptr:
5267     return legalizePreloadedArgIntrin(MI, MRI, B,
5268                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5269   case Intrinsic::amdgcn_implicit_buffer_ptr:
5270     return legalizePreloadedArgIntrin(
5271       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5272   case Intrinsic::amdgcn_dispatch_id:
5273     return legalizePreloadedArgIntrin(MI, MRI, B,
5274                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5275   case Intrinsic::r600_read_ngroups_x:
5276     // TODO: Emit error for hsa
5277     return legalizeKernargMemParameter(MI, B,
5278                                        SI::KernelInputOffsets::NGROUPS_X);
5279   case Intrinsic::r600_read_ngroups_y:
5280     return legalizeKernargMemParameter(MI, B,
5281                                        SI::KernelInputOffsets::NGROUPS_Y);
5282   case Intrinsic::r600_read_ngroups_z:
5283     return legalizeKernargMemParameter(MI, B,
5284                                        SI::KernelInputOffsets::NGROUPS_Z);
5285   case Intrinsic::r600_read_local_size_x:
5286     // TODO: Could insert G_ASSERT_ZEXT from s16
5287     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5288   case Intrinsic::r600_read_local_size_y:
5289     // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5294   case Intrinsic::r600_read_global_size_x:
5295     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5296   case Intrinsic::r600_read_global_size_y:
5297     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5298   case Intrinsic::r600_read_global_size_z:
5299     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5300   case Intrinsic::amdgcn_fdiv_fast:
5301     return legalizeFDIVFastIntrin(MI, MRI, B);
5302   case Intrinsic::amdgcn_is_shared:
5303     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5304   case Intrinsic::amdgcn_is_private:
5305     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5306   case Intrinsic::amdgcn_wavefrontsize: {
5307     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5308     MI.eraseFromParent();
5309     return true;
5310   }
5311   case Intrinsic::amdgcn_s_buffer_load:
5312     return legalizeSBufferLoad(Helper, MI);
5313   case Intrinsic::amdgcn_raw_buffer_store:
5314   case Intrinsic::amdgcn_struct_buffer_store:
5315     return legalizeBufferStore(MI, MRI, B, false, false);
5316   case Intrinsic::amdgcn_raw_buffer_store_format:
5317   case Intrinsic::amdgcn_struct_buffer_store_format:
5318     return legalizeBufferStore(MI, MRI, B, false, true);
5319   case Intrinsic::amdgcn_raw_tbuffer_store:
5320   case Intrinsic::amdgcn_struct_tbuffer_store:
5321     return legalizeBufferStore(MI, MRI, B, true, true);
5322   case Intrinsic::amdgcn_raw_buffer_load:
5323   case Intrinsic::amdgcn_struct_buffer_load:
5324     return legalizeBufferLoad(MI, MRI, B, false, false);
5325   case Intrinsic::amdgcn_raw_buffer_load_format:
5326   case Intrinsic::amdgcn_struct_buffer_load_format:
5327     return legalizeBufferLoad(MI, MRI, B, true, false);
5328   case Intrinsic::amdgcn_raw_tbuffer_load:
5329   case Intrinsic::amdgcn_struct_tbuffer_load:
5330     return legalizeBufferLoad(MI, MRI, B, true, true);
5331   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5332   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5333   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5334   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5335   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5336   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5337   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5338   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5339   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5340   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5341   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5342   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5343   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5344   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5345   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5346   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5347   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5348   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5349   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5350   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5351   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5352   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5353   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5354   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5355   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5356   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5357   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5358   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5359   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5360   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5361     return legalizeBufferAtomic(MI, B, IntrID);
5362   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5363   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5364     Register DstReg = MI.getOperand(0).getReg();
5365     if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
5366       Function &F = B.getMF().getFunction();
5367       DiagnosticInfoUnsupported NoFpRet(
5368           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5369           DS_Error);
5370       F.getContext().diagnose(NoFpRet);
5371       B.buildUndef(DstReg);
5372       MI.eraseFromParent();
5373       return true;
5374     }
5375 
5376     return legalizeBufferAtomic(MI, B, IntrID);
5377   }
5378   case Intrinsic::amdgcn_atomic_inc:
5379     return legalizeAtomicIncDec(MI, B, true);
5380   case Intrinsic::amdgcn_atomic_dec:
5381     return legalizeAtomicIncDec(MI, B, false);
5382   case Intrinsic::trap:
5383     return legalizeTrapIntrinsic(MI, MRI, B);
5384   case Intrinsic::debugtrap:
5385     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5386   case Intrinsic::amdgcn_rsq_clamp:
5387     return legalizeRsqClampIntrinsic(MI, MRI, B);
5388   case Intrinsic::amdgcn_ds_fadd:
5389   case Intrinsic::amdgcn_ds_fmin:
5390   case Intrinsic::amdgcn_ds_fmax:
5391     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5392   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5393     return legalizeBVHIntrinsic(MI, B);
5394   default: {
5395     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5396             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5397       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5398     return true;
5399   }
5400   }
5401 
5402   return true;
5403 }
5404