1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
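// e.g. <3 x s32> -> <4 x s32>, <5 x s16> -> <8 x s16>.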
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
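// e.g. s24 -> s32, s48 -> s64, s65 -> s128.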
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
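// Reduce the element count so each piece of the split vector fits in 64 bits,
// e.g. <4 x s32> -> <2 x s32>.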
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
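// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits) and <5 x s16> (80 bits) ->
// <6 x s16> (96 bits).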
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
  if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
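  // e.g. <4 x s16> -> <2 x s32>, <6 x s16> -> <3 x s32>.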
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
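// True for sizes that map onto a whole number of 32-bit registers, up to the
// maximum register size.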
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
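// Return the maximum memory size, in bits, that a single load or store in
// address space AS may access before it must be split.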
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .widenScalarToNextPow2(1, 32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32);
936 
937 
938   // The hardware instructions return a different result on 0 than the generic
939   // instructions expect. The hardware produces -1, but these produce the
940   // bitwidth.
941   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
942     .scalarize(0)
943     .clampScalar(0, S32, S32)
944     .clampScalar(1, S32, S64)
945     .widenScalarToNextPow2(0, 32)
946     .widenScalarToNextPow2(1, 32)
947     .custom();
948 
949   // The 64-bit versions produce 32-bit results, but only on the SALU.
950   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
951     .legalFor({{S32, S32}, {S32, S64}})
952     .clampScalar(0, S32, S32)
953     .clampScalar(1, S32, S64)
954     .scalarize(0)
955     .widenScalarToNextPow2(0, 32)
956     .widenScalarToNextPow2(1, 32);
957 
958   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
959   // RegBankSelect.
960   getActionDefinitionsBuilder(G_BITREVERSE)
961     .legalFor({S32, S64})
962     .clampScalar(0, S32, S64)
963     .scalarize(0)
964     .widenScalarToNextPow2(0);
965 
966   if (ST.has16BitInsts()) {
967     getActionDefinitionsBuilder(G_BSWAP)
968       .legalFor({S16, S32, V2S16})
969       .clampMaxNumElementsStrict(0, S16, 2)
970       // FIXME: Fixing non-power-of-2 before clamp is workaround for
971       // narrowScalar limitation.
972       .widenScalarToNextPow2(0)
973       .clampScalar(0, S16, S32)
974       .scalarize(0);
975 
976     if (ST.hasVOP3PInsts()) {
977       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
978         .legalFor({S32, S16, V2S16})
979         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
980         .clampMaxNumElements(0, S16, 2)
981         .minScalar(0, S16)
982         .widenScalarToNextPow2(0)
983         .scalarize(0)
984         .lower();
985     } else {
986       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
987         .legalFor({S32, S16})
988         .widenScalarToNextPow2(0)
989         .minScalar(0, S16)
990         .scalarize(0)
991         .lower();
992     }
993   } else {
994     // TODO: Should have same legality without v_perm_b32
995     getActionDefinitionsBuilder(G_BSWAP)
996       .legalFor({S32})
997       .lowerIf(scalarNarrowerThan(0, 32))
998       // FIXME: Fixing non-power-of-2 before clamp is workaround for
999       // narrowScalar limitation.
1000       .widenScalarToNextPow2(0)
1001       .maxScalar(0, S32)
1002       .scalarize(0)
1003       .lower();
1004 
1005     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1006       .legalFor({S32})
1007       .minScalar(0, S32)
1008       .widenScalarToNextPow2(0)
1009       .scalarize(0)
1010       .lower();
1011   }
1012 
1013   getActionDefinitionsBuilder(G_INTTOPTR)
1014     // List the common cases
1015     .legalForCartesianProduct(AddrSpaces64, {S64})
1016     .legalForCartesianProduct(AddrSpaces32, {S32})
1017     .scalarize(0)
1018     // Accept any address space as long as the size matches
1019     .legalIf(sameSize(0, 1))
1020     .widenScalarIf(smallerThan(1, 0),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023       })
1024     .narrowScalarIf(largerThan(1, 0),
1025       [](const LegalityQuery &Query) {
1026         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1027       });
1028 
1029   getActionDefinitionsBuilder(G_PTRTOINT)
1030     // List the common cases
1031     .legalForCartesianProduct(AddrSpaces64, {S64})
1032     .legalForCartesianProduct(AddrSpaces32, {S32})
1033     .scalarize(0)
1034     // Accept any address space as long as the size matches
1035     .legalIf(sameSize(0, 1))
1036     .widenScalarIf(smallerThan(0, 1),
1037       [](const LegalityQuery &Query) {
1038         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1039       })
1040     .narrowScalarIf(
1041       largerThan(0, 1),
1042       [](const LegalityQuery &Query) {
1043         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1044       });
1045 
1046   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1047     .scalarize(0)
1048     .custom();
1049 
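  // Whether a memory access must be split because it is wider than the address
  // space allows, or has an awkward size (e.g. 96 bits without dwordx3
  // support).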
1050   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1051                                     bool IsLoad) -> bool {
1052     const LLT DstTy = Query.Types[0];
1053 
1054     // Split vector extloads.
1055     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1056 
1057     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058       return true;
1059 
1060     const LLT PtrTy = Query.Types[1];
1061     unsigned AS = PtrTy.getAddressSpace();
1062     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063       return true;
1064 
1065     // Catch weird sized loads that don't evenly divide into the access sizes
1066     // TODO: May be able to widen depending on alignment etc.
1067     unsigned NumRegs = (MemSize + 31) / 32;
1068     if (NumRegs == 3) {
1069       if (!ST.hasDwordx3LoadStores())
1070         return true;
1071     } else {
1072       // If the alignment allows, these should have been widened.
1073       if (!isPowerOf2_32(NumRegs))
1074         return true;
1075     }
1076 
1077     return false;
1078   };
1079 
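  // Minimum alignment (in bits) required for global and constant accesses in
  // the memory descriptors below; 0 means no extra alignment requirement when
  // unaligned buffer access is enabled.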
1080   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1081   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1082   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1083 
1084   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1085   // LDS
1086   // TODO: Unsupported flat for SI.
1087 
1088   for (unsigned Op : {G_LOAD, G_STORE}) {
1089     const bool IsStore = Op == G_STORE;
1090 
1091     auto &Actions = getActionDefinitionsBuilder(Op);
1092     // Explicitly list some common cases.
1093     // TODO: Does this help compile time at all?
1094     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1095                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1096                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1097                                       {S64, GlobalPtr, S64, GlobalAlign32},
1098                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1099                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1100                                       {S32, GlobalPtr, S8, GlobalAlign8},
1101                                       {S32, GlobalPtr, S16, GlobalAlign16},
1102 
1103                                       {S32, LocalPtr, S32, 32},
1104                                       {S64, LocalPtr, S64, 32},
1105                                       {V2S32, LocalPtr, V2S32, 32},
1106                                       {S32, LocalPtr, S8, 8},
1107                                       {S32, LocalPtr, S16, 16},
1108                                       {V2S16, LocalPtr, S32, 32},
1109 
1110                                       {S32, PrivatePtr, S32, 32},
1111                                       {S32, PrivatePtr, S8, 8},
1112                                       {S32, PrivatePtr, S16, 16},
1113                                       {V2S16, PrivatePtr, S32, 32},
1114 
1115                                       {S32, ConstantPtr, S32, GlobalAlign32},
1116                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1117                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1118                                       {S64, ConstantPtr, S64, GlobalAlign32},
1119                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1120     Actions.legalIf(
1121       [=](const LegalityQuery &Query) -> bool {
1122         return isLoadStoreLegal(ST, Query);
1123       });
1124 
1125     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1126     // 64-bits.
1127     //
1128     // TODO: Should generalize bitcast action into coerce, which will also cover
1129     // inserting addrspacecasts.
1130     Actions.customIf(typeIs(1, Constant32Ptr));
1131 
1132     // Turn any illegal element vectors into something easier to deal
1133     // with. These will ultimately produce 32-bit scalar shifts to extract the
1134     // parts anyway.
1135     //
1136     // For odd 16-bit element vectors, prefer to split those into pieces with
1137     // 16-bit vector parts.
1138     Actions.bitcastIf(
1139       [=](const LegalityQuery &Query) -> bool {
1140         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1141                                           Query.MMODescrs[0].MemoryTy);
1142       }, bitcastToRegisterType(0));
1143 
1144     if (!IsStore) {
1145       // Widen suitably aligned loads by loading extra bytes. The standard
1146       // legalization actions can't properly express widening memory operands.
1147       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1148         return shouldWidenLoad(ST, Query, G_LOAD);
1149       });
1150     }
1151 
1152     // FIXME: load/store narrowing should be moved to lower action
1153     Actions
1154         .narrowScalarIf(
1155             [=](const LegalityQuery &Query) -> bool {
1156               return !Query.Types[0].isVector() &&
1157                      needToSplitMemOp(Query, Op == G_LOAD);
1158             },
1159             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1160               const LLT DstTy = Query.Types[0];
1161               const LLT PtrTy = Query.Types[1];
1162 
1163               const unsigned DstSize = DstTy.getSizeInBits();
1164               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1165 
1166               // Split extloads.
1167               if (DstSize > MemSize)
1168                 return std::make_pair(0, LLT::scalar(MemSize));
1169 
1170               unsigned MaxSize = maxSizeForAddrSpace(ST,
1171                                                      PtrTy.getAddressSpace(),
1172                                                      Op == G_LOAD);
1173               if (MemSize > MaxSize)
1174                 return std::make_pair(0, LLT::scalar(MaxSize));
1175 
1176               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1177               return std::make_pair(0, LLT::scalar(Align));
1178             })
1179         .fewerElementsIf(
1180             [=](const LegalityQuery &Query) -> bool {
1181               return Query.Types[0].isVector() &&
1182                      needToSplitMemOp(Query, Op == G_LOAD);
1183             },
1184             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1185               const LLT DstTy = Query.Types[0];
1186               const LLT PtrTy = Query.Types[1];
1187 
1188               LLT EltTy = DstTy.getElementType();
1189               unsigned MaxSize = maxSizeForAddrSpace(ST,
1190                                                      PtrTy.getAddressSpace(),
1191                                                      Op == G_LOAD);
1192 
1193               // FIXME: Handle widened to power of 2 results better. This ends
1194               // up scalarizing.
1195               // FIXME: 3 element stores scalarized on SI
1196 
1197               // Split if it's too large for the address space.
1198               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1199               if (MemSize > MaxSize) {
1200                 unsigned NumElts = DstTy.getNumElements();
1201                 unsigned EltSize = EltTy.getSizeInBits();
1202 
1203                 if (MaxSize % EltSize == 0) {
1204                   return std::make_pair(
1205                       0, LLT::scalarOrVector(
1206                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1207                 }
1208 
1209                 unsigned NumPieces = MemSize / MaxSize;
1210 
1211                 // FIXME: Refine when odd breakdowns handled
1212                 // The scalars will need to be re-legalized.
1213                 if (NumPieces == 1 || NumPieces >= NumElts ||
1214                     NumElts % NumPieces != 0)
1215                   return std::make_pair(0, EltTy);
1216 
1217                 return std::make_pair(
1218                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1219               }
1220 
1221               // FIXME: We could probably handle weird extending loads better.
1222               if (DstTy.getSizeInBits() > MemSize)
1223                 return std::make_pair(0, EltTy);
1224 
1225               unsigned EltSize = EltTy.getSizeInBits();
1226               unsigned DstSize = DstTy.getSizeInBits();
1227               if (!isPowerOf2_32(DstSize)) {
1228                 // We're probably decomposing an odd sized store. Try to split
1229                 // to the widest type. TODO: Account for alignment. As-is it
1230                 // should be OK, since the new parts will be further legalized.
1231                 unsigned FloorSize = PowerOf2Floor(DstSize);
1232                 return std::make_pair(
1233                     0, LLT::scalarOrVector(
1234                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1235               }
1236 
1237               // May need relegalization for the scalars.
1238               return std::make_pair(0, EltTy);
1239             })
1240     .minScalar(0, S32)
1241     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1242     .widenScalarToNextPow2(0)
1243     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1244     .lower();
1245   }
1246 
1247   // FIXME: Unaligned accesses not lowered.
1248   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1249                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1250                                                   {S32, GlobalPtr, S16, 2 * 8},
1251                                                   {S32, LocalPtr, S8, 8},
1252                                                   {S32, LocalPtr, S16, 16},
1253                                                   {S32, PrivatePtr, S8, 8},
1254                                                   {S32, PrivatePtr, S16, 16},
1255                                                   {S32, ConstantPtr, S8, 8},
1256                                                   {S32, ConstantPtr, S16, 2 * 8}})
1257                        .legalIf(
1258                          [=](const LegalityQuery &Query) -> bool {
1259                            return isLoadStoreLegal(ST, Query);
1260                          });
1261 
1262   if (ST.hasFlatAddressSpace()) {
1263     ExtLoads.legalForTypesWithMemDesc(
1264         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1265   }
1266 
1267   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1268   // 64-bits.
1269   //
1270   // TODO: Should generalize bitcast action into coerce, which will also cover
1271   // inserting addrspacecasts.
1272   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1273 
1274   ExtLoads.clampScalar(0, S32, S32)
1275           .widenScalarToNextPow2(0)
1276           .lower();
1277 
1278   auto &Atomics = getActionDefinitionsBuilder(
1279     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1280      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1281      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1282      G_ATOMICRMW_UMIN})
1283     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1284                {S64, GlobalPtr}, {S64, LocalPtr},
1285                {S32, RegionPtr}, {S64, RegionPtr}});
1286   if (ST.hasFlatAddressSpace()) {
1287     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1288   }
1289 
1290   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1291   if (ST.hasLDSFPAtomicAdd()) {
1292     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1293     if (ST.hasGFX90AInsts())
1294       Atomic.legalFor({{S64, LocalPtr}});
1295   }
1296   if (ST.hasAtomicFaddInsts())
1297     Atomic.legalFor({{S32, GlobalPtr}});
1298 
1299   if (ST.hasGFX90AInsts()) {
1300     // These are legal with some caveats, and should have undergone expansion in
1301     // the IR in most situations
1302     // TODO: Move atomic expansion into legalizer
1303     // TODO: Also supports <2 x f16>
1304     Atomic.legalFor({
1305         {S32, GlobalPtr},
1306         {S64, GlobalPtr},
1307         {S64, FlatPtr}
1308       });
1309   }
1310 
1311   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1312   // demarshalling
1313   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1314     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1315                 {S32, FlatPtr}, {S64, FlatPtr}})
1316     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1317                {S32, RegionPtr}, {S64, RegionPtr}});
1318   // TODO: Pointer types, any 32-bit or 64-bit vector
1319 
1320   // Condition should be s32 for scalar, s1 for vector.
1321   getActionDefinitionsBuilder(G_SELECT)
1322       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1323                                  LocalPtr, FlatPtr, PrivatePtr,
1324                                  LLT::fixed_vector(2, LocalPtr),
1325                                  LLT::fixed_vector(2, PrivatePtr)},
1326                                 {S1, S32})
1327       .clampScalar(0, S16, S64)
1328       .scalarize(1)
1329       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1330       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1331       .clampMaxNumElements(0, S32, 2)
1332       .clampMaxNumElements(0, LocalPtr, 2)
1333       .clampMaxNumElements(0, PrivatePtr, 2)
1334       .scalarize(0)
1335       .widenScalarToNextPow2(0)
1336       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1337 
1338   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1339   // be more flexible with the shift amount type.
1340   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1341     .legalFor({{S32, S32}, {S64, S32}});
1342   if (ST.has16BitInsts()) {
1343     if (ST.hasVOP3PInsts()) {
1344       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1345             .clampMaxNumElements(0, S16, 2);
1346     } else
1347       Shifts.legalFor({{S16, S16}});
1348 
1349     // TODO: Support 16-bit shift amounts for all types
1350     Shifts.widenScalarIf(
1351       [=](const LegalityQuery &Query) {
1352         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1353         // 32-bit amount.
1354         const LLT ValTy = Query.Types[0];
1355         const LLT AmountTy = Query.Types[1];
1356         return ValTy.getSizeInBits() <= 16 &&
1357                AmountTy.getSizeInBits() < 16;
1358       }, changeTo(1, S16));
1359     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1360     Shifts.clampScalar(1, S32, S32);
1361     Shifts.widenScalarToNextPow2(0, 16);
1362     Shifts.clampScalar(0, S16, S64);
1363 
1364     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1365       .minScalar(0, S16)
1366       .scalarize(0)
1367       .lower();
1368   } else {
1369     // Make sure we legalize the shift amount type first, as the general
1370     // expansion for the shifted type will produce much worse code if it hasn't
1371     // been truncated already.
1372     Shifts.clampScalar(1, S32, S32);
1373     Shifts.widenScalarToNextPow2(0, 32);
1374     Shifts.clampScalar(0, S32, S64);
1375 
1376     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1377       .minScalar(0, S32)
1378       .scalarize(0)
1379       .lower();
1380   }
1381   Shifts.scalarize(0);
1382 
1383   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
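    // For G_EXTRACT_VECTOR_ELT the result (type index 0) is the element and
    // the source (type index 1) is the vector; for G_INSERT_VECTOR_ELT it is
    // the reverse. The index operand is type index 2 for both.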
1384     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1385     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1386     unsigned IdxTypeIdx = 2;
1387 
1388     getActionDefinitionsBuilder(Op)
1389       .customIf([=](const LegalityQuery &Query) {
1390           const LLT EltTy = Query.Types[EltTypeIdx];
1391           const LLT VecTy = Query.Types[VecTypeIdx];
1392           const LLT IdxTy = Query.Types[IdxTypeIdx];
1393           const unsigned EltSize = EltTy.getSizeInBits();
1394           return (EltSize == 32 || EltSize == 64) &&
1395                   VecTy.getSizeInBits() % 32 == 0 &&
1396                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1397                   IdxTy.getSizeInBits() == 32;
1398         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1401       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1402       .bitcastIf(
1403         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1404         [=](const LegalityQuery &Query) {
1405           // For > 64-bit element types, try to turn this into a 64-bit
1406           // element vector since we may be able to do better indexing
1407           // if this is scalar. If not, fall back to 32.
1408           const LLT EltTy = Query.Types[EltTypeIdx];
1409           const LLT VecTy = Query.Types[VecTypeIdx];
1410           const unsigned DstEltSize = EltTy.getSizeInBits();
1411           const unsigned VecSize = VecTy.getSizeInBits();
1412 
1413           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1414           return std::make_pair(
1415               VecTypeIdx,
1416               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1417         })
1418       .clampScalar(EltTypeIdx, S32, S64)
1419       .clampScalar(VecTypeIdx, S32, S64)
1420       .clampScalar(IdxTypeIdx, S32, S32)
1421       .clampMaxNumElements(VecTypeIdx, S32, 32)
1422       // TODO: Clamp elements for 64-bit vectors?
1423       // It should only be necessary with variable indexes.
1424       // As a last resort, lower to the stack
1425       .lower();
1426   }
1427 
1428   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1429     .unsupportedIf([=](const LegalityQuery &Query) {
1430         const LLT &EltTy = Query.Types[1].getElementType();
1431         return Query.Types[0] != EltTy;
1432       });
1433 
1434   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1435     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1436     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1437 
1438     // FIXME: Doesn't handle extract of illegal sizes.
1439     getActionDefinitionsBuilder(Op)
1440       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1441       .lowerIf([=](const LegalityQuery &Query) {
1442           // Sub-vector (or single element) insert and extract.
1443           // TODO: verify immediate offset here since lower only works with
1444           // whole elements.
1445           const LLT BigTy = Query.Types[BigTyIdx];
1446           return BigTy.isVector();
1447         })
1448       // FIXME: Multiples of 16 should not be legal.
1449       .legalIf([=](const LegalityQuery &Query) {
1450           const LLT BigTy = Query.Types[BigTyIdx];
1451           const LLT LitTy = Query.Types[LitTyIdx];
1452           return (BigTy.getSizeInBits() % 32 == 0) &&
1453                  (LitTy.getSizeInBits() % 16 == 0);
1454         })
1455       .widenScalarIf(
1456         [=](const LegalityQuery &Query) {
1457           const LLT BigTy = Query.Types[BigTyIdx];
1458           return (BigTy.getScalarSizeInBits() < 16);
1459         },
1460         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1461       .widenScalarIf(
1462         [=](const LegalityQuery &Query) {
1463           const LLT LitTy = Query.Types[LitTyIdx];
1464           return (LitTy.getScalarSizeInBits() < 16);
1465         },
1466         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1467       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1468       .widenScalarToNextPow2(BigTyIdx, 32);
1469 
1470   }
1471 
1472   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1473     .legalForCartesianProduct(AllS32Vectors, {S32})
1474     .legalForCartesianProduct(AllS64Vectors, {S64})
1475     .clampNumElements(0, V16S32, V32S32)
1476     .clampNumElements(0, V2S64, V16S64)
1477     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1478 
1479   if (ST.hasScalarPackInsts()) {
1480     BuildVector
1481       // FIXME: Should probably widen s1 vectors straight to s32
1482       .minScalarOrElt(0, S16)
1483       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1484       .minScalar(1, S32);
1485 
1486     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1487       .legalFor({V2S16, S32})
1488       .lower();
1489     BuildVector.minScalarOrElt(0, S32);
1490   } else {
1491     BuildVector.customFor({V2S16, S16});
1492     BuildVector.minScalarOrElt(0, S32);
1493 
1494     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1495       .customFor({V2S16, S32})
1496       .lower();
1497   }
1498 
1499   BuildVector.legalIf(isRegisterType(0));
1500 
1501   // FIXME: Clamp maximum size
1502   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504     .clampMaxNumElements(0, S32, 32)
1505     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506     .clampMaxNumElements(0, S16, 64);
1507 
1508   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1509   // pre-legalize.
1510   if (ST.hasVOP3PInsts()) {
1511     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1512       .customFor({V2S16, V2S16})
1513       .lower();
1514   } else
1515     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1516 
1517   // Merge/Unmerge
1518   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1519     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1520     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1521 
1522     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1523       const LLT Ty = Query.Types[TypeIdx];
1524       if (Ty.isVector()) {
1525         const LLT &EltTy = Ty.getElementType();
1526         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1527           return true;
1528         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1529           return true;
1530       }
1531       return false;
1532     };
1533 
1534     auto &Builder = getActionDefinitionsBuilder(Op)
1535       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1536       .lowerFor({{S16, V2S16}})
1537       .lowerIf([=](const LegalityQuery &Query) {
1538           const LLT BigTy = Query.Types[BigTyIdx];
1539           return BigTy.getSizeInBits() == 32;
1540         })
1541       // Try to widen to s16 first for small types.
1542       // TODO: Only do this on targets with legal s16 shifts
1543       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1544       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1545       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1546       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1547                            elementTypeIs(1, S16)),
1548                        changeTo(1, V2S16))
1549       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1550       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1551       // valid.
1552       .clampScalar(LitTyIdx, S32, S512)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1554       // Break up vectors with weird elements into scalars
1555       .fewerElementsIf(
1556         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1557         scalarize(0))
1558       .fewerElementsIf(
1559         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1560         scalarize(1))
1561       .clampScalar(BigTyIdx, S32, MaxScalar);
1562 
1563     if (Op == G_MERGE_VALUES) {
1564       Builder.widenScalarIf(
1565         // TODO: Use 16-bit shifts if legal for 8-bit values?
1566         [=](const LegalityQuery &Query) {
1567           const LLT Ty = Query.Types[LitTyIdx];
1568           return Ty.getSizeInBits() < 32;
1569         },
1570         changeTo(LitTyIdx, S32));
1571     }
1572 
1573     Builder.widenScalarIf(
1574       [=](const LegalityQuery &Query) {
1575         const LLT Ty = Query.Types[BigTyIdx];
1576         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1577           Ty.getSizeInBits() % 16 != 0;
1578       },
1579       [=](const LegalityQuery &Query) {
1580         // Pick the next power of 2, or a multiple of 64 over 128,
1581         // whichever is smaller.
1582         const LLT &Ty = Query.Types[BigTyIdx];
1583         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1584         if (NewSizeInBits >= 256) {
1585           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1586           if (RoundedTo < NewSizeInBits)
1587             NewSizeInBits = RoundedTo;
1588         }
1589         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1590       })
1591       // Any vectors left are the wrong size. Scalarize them.
1592       .scalarize(0)
1593       .scalarize(1);
1594   }
1595 
1596   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1597   // RegBankSelect.
1598   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1599     .legalFor({{S32}, {S64}});
1600 
1601   if (ST.hasVOP3PInsts()) {
1602     SextInReg.lowerFor({{V2S16}})
1603       // Prefer to reduce the vector width for 16-bit vectors before lowering,
1604       // to get more vector shift opportunities, since the lowering itself
1605       // expands to shifts.
1606       .clampMaxNumElementsStrict(0, S16, 2);
1607   } else if (ST.has16BitInsts()) {
1608     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1609   } else {
1610     // Prefer to promote to s32 before lowering if we don't have 16-bit
1611     // shifts. This avoids a lot of intermediate truncate and extend operations.
1612     SextInReg.lowerFor({{S32}, {S64}});
1613   }
1614 
1615   SextInReg
1616     .scalarize(0)
1617     .clampScalar(0, S32, S64)
1618     .lower();
1619 
1620   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1621     .scalarize(0)
1622     .lower();
1623 
1624   // TODO: Only try to form v2s16 with legal packed instructions.
1625   getActionDefinitionsBuilder(G_FSHR)
1626     .legalFor({{S32, S32}})
1627     .lowerFor({{V2S16, V2S16}})
1628     .clampMaxNumElementsStrict(0, S16, 2)
1629     .scalarize(0)
1630     .lower();
1631 
1632   if (ST.hasVOP3PInsts()) {
1633     getActionDefinitionsBuilder(G_FSHL)
1634       .lowerFor({{V2S16, V2S16}})
1635       .clampMaxNumElementsStrict(0, S16, 2)
1636       .scalarize(0)
1637       .lower();
1638   } else {
1639     getActionDefinitionsBuilder(G_FSHL)
1640       .scalarize(0)
1641       .lower();
1642   }
1643 
1644   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1645     .legalFor({S64});
1646 
1647   getActionDefinitionsBuilder(G_FENCE)
1648     .alwaysLegal();
1649 
1650   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1651       .scalarize(0)
1652       .minScalar(0, S32)
1653       .lower();
1654 
1655   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1656       .legalFor({{S32, S32}, {S64, S32}})
1657       .clampScalar(1, S32, S32)
1658       .clampScalar(0, S32, S64)
1659       .widenScalarToNextPow2(0)
1660       .scalarize(0);
1661 
1662   getActionDefinitionsBuilder({
1663       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1664       G_FCOPYSIGN,
1665 
1666       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1667       G_ATOMICRMW_NAND,
1668       G_ATOMICRMW_FSUB,
1669       G_READ_REGISTER,
1670       G_WRITE_REGISTER,
1671 
1672       G_SADDO, G_SSUBO,
1673 
1674       // TODO: Implement
1675       G_FMINIMUM, G_FMAXIMUM}).lower();
1676 
1677   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1678       .lower();
1679 
1680   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1681         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1682         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1683     .unsupported();
1684 
1685   getLegacyLegalizerInfo().computeTables();
1686   verify(*ST.getInstrInfo());
1687 }
1688 
1689 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1690                                          MachineInstr &MI) const {
1691   MachineIRBuilder &B = Helper.MIRBuilder;
1692   MachineRegisterInfo &MRI = *B.getMRI();
1693 
1694   switch (MI.getOpcode()) {
1695   case TargetOpcode::G_ADDRSPACE_CAST:
1696     return legalizeAddrSpaceCast(MI, MRI, B);
1697   case TargetOpcode::G_FRINT:
1698     return legalizeFrint(MI, MRI, B);
1699   case TargetOpcode::G_FCEIL:
1700     return legalizeFceil(MI, MRI, B);
1701   case TargetOpcode::G_FREM:
1702     return legalizeFrem(MI, MRI, B);
1703   case TargetOpcode::G_INTRINSIC_TRUNC:
1704     return legalizeIntrinsicTrunc(MI, MRI, B);
1705   case TargetOpcode::G_SITOFP:
1706     return legalizeITOFP(MI, MRI, B, true);
1707   case TargetOpcode::G_UITOFP:
1708     return legalizeITOFP(MI, MRI, B, false);
1709   case TargetOpcode::G_FPTOSI:
1710     return legalizeFPTOI(MI, MRI, B, true);
1711   case TargetOpcode::G_FPTOUI:
1712     return legalizeFPTOI(MI, MRI, B, false);
1713   case TargetOpcode::G_FMINNUM:
1714   case TargetOpcode::G_FMAXNUM:
1715   case TargetOpcode::G_FMINNUM_IEEE:
1716   case TargetOpcode::G_FMAXNUM_IEEE:
1717     return legalizeMinNumMaxNum(Helper, MI);
1718   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1719     return legalizeExtractVectorElt(MI, MRI, B);
1720   case TargetOpcode::G_INSERT_VECTOR_ELT:
1721     return legalizeInsertVectorElt(MI, MRI, B);
1722   case TargetOpcode::G_SHUFFLE_VECTOR:
1723     return legalizeShuffleVector(MI, MRI, B);
1724   case TargetOpcode::G_FSIN:
1725   case TargetOpcode::G_FCOS:
1726     return legalizeSinCos(MI, MRI, B);
1727   case TargetOpcode::G_GLOBAL_VALUE:
1728     return legalizeGlobalValue(MI, MRI, B);
1729   case TargetOpcode::G_LOAD:
1730   case TargetOpcode::G_SEXTLOAD:
1731   case TargetOpcode::G_ZEXTLOAD:
1732     return legalizeLoad(Helper, MI);
1733   case TargetOpcode::G_FMAD:
1734     return legalizeFMad(MI, MRI, B);
1735   case TargetOpcode::G_FDIV:
1736     return legalizeFDIV(MI, MRI, B);
1737   case TargetOpcode::G_UDIV:
1738   case TargetOpcode::G_UREM:
1739   case TargetOpcode::G_UDIVREM:
1740     return legalizeUnsignedDIV_REM(MI, MRI, B);
1741   case TargetOpcode::G_SDIV:
1742   case TargetOpcode::G_SREM:
1743   case TargetOpcode::G_SDIVREM:
1744     return legalizeSignedDIV_REM(MI, MRI, B);
1745   case TargetOpcode::G_ATOMIC_CMPXCHG:
1746     return legalizeAtomicCmpXChg(MI, MRI, B);
1747   case TargetOpcode::G_FLOG:
1748     return legalizeFlog(MI, B, numbers::ln2f);
1749   case TargetOpcode::G_FLOG10:
1750     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1751   case TargetOpcode::G_FEXP:
1752     return legalizeFExp(MI, B);
1753   case TargetOpcode::G_FPOW:
1754     return legalizeFPow(MI, B);
1755   case TargetOpcode::G_FFLOOR:
1756     return legalizeFFloor(MI, MRI, B);
1757   case TargetOpcode::G_BUILD_VECTOR:
1758     return legalizeBuildVector(MI, MRI, B);
1759   case TargetOpcode::G_CTLZ:
1760   case TargetOpcode::G_CTTZ:
1761     return legalizeCTLZ_CTTZ(MI, MRI, B);
1762   default:
1763     return false;
1764   }
1765 
1766   llvm_unreachable("expected switch to return");
1767 }
1768 
1769 Register AMDGPULegalizerInfo::getSegmentAperture(
1770   unsigned AS,
1771   MachineRegisterInfo &MRI,
1772   MachineIRBuilder &B) const {
1773   MachineFunction &MF = B.getMF();
1774   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1775   const LLT S32 = LLT::scalar(32);
1776 
1777   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1778 
1779   if (ST.hasApertureRegs()) {
1780     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1781     // getreg.
1782     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1783         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1784         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1785     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1786         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1787         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1788     unsigned Encoding =
1789         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1790         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1791         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1792 
1793     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1794 
1795     B.buildInstr(AMDGPU::S_GETREG_B32)
1796       .addDef(GetReg)
1797       .addImm(Encoding);
1798     MRI.setType(GetReg, S32);
1799 
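    // The aperture base comes back in a (WidthM1 + 1) = 16-bit field; shifting
    // it left by the field width forms the high 32 bits of the 64-bit segment
    // base, which is the value the callers of this function need.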
1800     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1801     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1802   }
1803 
1804   Register QueuePtr = MRI.createGenericVirtualRegister(
1805     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1806 
1807   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1808     return Register();
1809 
1810   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1811   // private_segment_aperture_base_hi.
1812   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1813 
1814   // TODO: can we be smarter about machine pointer info?
1815   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1816   MachineMemOperand *MMO = MF.getMachineMemOperand(
1817       PtrInfo,
1818       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1819           MachineMemOperand::MOInvariant,
1820       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1821 
1822   Register LoadAddr;
1823 
1824   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1825   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1826 }
1827 
1828 /// Return true if the value is a known valid address, such that a null check is
1829 /// not necessary.
1830 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1831                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1832   MachineInstr *Def = MRI.getVRegDef(Val);
1833   switch (Def->getOpcode()) {
1834   case AMDGPU::G_FRAME_INDEX:
1835   case AMDGPU::G_GLOBAL_VALUE:
1836   case AMDGPU::G_BLOCK_ADDR:
1837     return true;
1838   case AMDGPU::G_CONSTANT: {
1839     const ConstantInt *CI = Def->getOperand(1).getCImm();
1840     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1841   }
1842   default:
1843     return false;
1844   }
1845 
1846   return false;
1847 }
1848 
1849 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1850   MachineInstr &MI, MachineRegisterInfo &MRI,
1851   MachineIRBuilder &B) const {
1852   MachineFunction &MF = B.getMF();
1853 
1854   const LLT S32 = LLT::scalar(32);
1855   Register Dst = MI.getOperand(0).getReg();
1856   Register Src = MI.getOperand(1).getReg();
1857 
1858   LLT DstTy = MRI.getType(Dst);
1859   LLT SrcTy = MRI.getType(Src);
1860   unsigned DestAS = DstTy.getAddressSpace();
1861   unsigned SrcAS = SrcTy.getAddressSpace();
1862 
1863   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1864   // vector element.
1865   assert(!DstTy.isVector());
1866 
1867   const AMDGPUTargetMachine &TM
1868     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1869 
1870   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1871     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1872     return true;
1873   }
1874 
1875   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1876       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1877        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1878     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1879       // Extract low 32-bits of the pointer.
1880       B.buildExtract(Dst, Src, 0);
1881       MI.eraseFromParent();
1882       return true;
1883     }
1884 
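    // In effect this lowers to:
    //   dst = (src != flat-null) ? lo32(src) : segment-null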
1885     unsigned NullVal = TM.getNullPointerValue(DestAS);
1886 
1887     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1888     auto FlatNull = B.buildConstant(SrcTy, 0);
1889 
1890     // Extract low 32-bits of the pointer.
1891     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1892 
1893     auto CmpRes =
1894         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1895     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1896 
1897     MI.eraseFromParent();
1898     return true;
1899   }
1900 
1901   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1902       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1903        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1904     if (!ST.hasFlatAddressSpace())
1905       return false;
1906 
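    // In effect this lowers to:
    //   dst = (src != segment-null) ? {lo = src, hi = aperture} : flat-null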
1907     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1908     if (!ApertureReg.isValid())
1909       return false;
1910 
1911     // Coerce the type of the low half of the result so we can use merge_values.
1912     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1913 
1914     // TODO: Should we allow mismatched types but matching sizes in merges to
1915     // avoid the ptrtoint?
1916     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1917 
1918     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1919       B.buildCopy(Dst, BuildPtr);
1920       MI.eraseFromParent();
1921       return true;
1922     }
1923 
1924     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1925     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1926 
1927     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1928                               SegmentNull.getReg(0));
1929 
1930     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1931 
1932     MI.eraseFromParent();
1933     return true;
1934   }
1935 
1936   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1937       SrcTy.getSizeInBits() == 64) {
1938     // Truncate.
1939     B.buildExtract(Dst, Src, 0);
1940     MI.eraseFromParent();
1941     return true;
1942   }
1943 
1944   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1945       DstTy.getSizeInBits() == 64) {
1946     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1947     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1948 
1949     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1950     // another. Merge operands are required to be the same type, but creating an
1951     // extra ptrtoint would be kind of pointless.
1952     auto HighAddr = B.buildConstant(
1953         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1954     B.buildMerge(Dst, {Src, HighAddr});
1955     MI.eraseFromParent();
1956     return true;
1957   }
1958 
1959   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1960       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
1961 
1962   LLVMContext &Ctx = MF.getFunction().getContext();
1963   Ctx.diagnose(InvalidAddrSpaceCast);
1964   B.buildUndef(Dst);
1965   MI.eraseFromParent();
1966   return true;
1967 }
1968 
1969 bool AMDGPULegalizerInfo::legalizeFrint(
1970   MachineInstr &MI, MachineRegisterInfo &MRI,
1971   MachineIRBuilder &B) const {
1972   Register Src = MI.getOperand(1).getReg();
1973   LLT Ty = MRI.getType(Src);
1974   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1975 
1976   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1977   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1978 
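  // Adding and subtracting 2^52 with the sign of the input forces the
  // fractional bits to be rounded away under the default round-to-nearest-even
  // mode, so for |src| < 2^52 the add/sub pair below produces rint(src);
  // larger magnitudes are already integral and are passed through by the final
  // select (which compares against the largest double below 2^52).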
1979   auto C1 = B.buildFConstant(Ty, C1Val);
1980   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1981 
1982   // TODO: Should this propagate fast-math-flags?
1983   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1984   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1985 
1986   auto C2 = B.buildFConstant(Ty, C2Val);
1987   auto Fabs = B.buildFAbs(Ty, Src);
1988 
1989   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1990   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1991   MI.eraseFromParent();
1992   return true;
1993 }
1994 
1995 bool AMDGPULegalizerInfo::legalizeFceil(
1996   MachineInstr &MI, MachineRegisterInfo &MRI,
1997   MachineIRBuilder &B) const {
1998 
1999   const LLT S1 = LLT::scalar(1);
2000   const LLT S64 = LLT::scalar(64);
2001 
2002   Register Src = MI.getOperand(1).getReg();
2003   assert(MRI.getType(Src) == S64);
2004 
2005   // result = trunc(src)
2006   // if (src > 0.0 && src != result)
2007   //   result += 1.0
2008 
2009   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2010 
2011   const auto Zero = B.buildFConstant(S64, 0.0);
2012   const auto One = B.buildFConstant(S64, 1.0);
2013   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2014   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2015   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2016   auto Add = B.buildSelect(S64, And, One, Zero);
2017 
2018   // TODO: Should this propagate fast-math-flags?
2019   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2020   MI.eraseFromParent();
2021   return true;
2022 }
2023 
2024 bool AMDGPULegalizerInfo::legalizeFrem(
2025   MachineInstr &MI, MachineRegisterInfo &MRI,
2026   MachineIRBuilder &B) const {
2027   Register DstReg = MI.getOperand(0).getReg();
2028   Register Src0Reg = MI.getOperand(1).getReg();
2029   Register Src1Reg = MI.getOperand(2).getReg();
2030   auto Flags = MI.getFlags();
2031   LLT Ty = MRI.getType(DstReg);
2032 
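  // The expansion below implements frem(x, y) = x - trunc(x / y) * y, with the
  // subtract and multiply folded into a single FMA: fma(-trunc(x / y), y, x).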
2033   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2034   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2035   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2036   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2037   MI.eraseFromParent();
2038   return true;
2039 }
2040 
2041 static MachineInstrBuilder extractF64Exponent(Register Hi,
2042                                               MachineIRBuilder &B) {
2043   const unsigned FractBits = 52;
2044   const unsigned ExpBits = 11;
2045   LLT S32 = LLT::scalar(32);
2046 
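  // In the high 32 bits of an f64 the exponent occupies bits [30:20] (bits
  // [62:52] of the full value); extract those 11 bits and subtract the bias
  // (1023) to get the unbiased exponent.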
2047   auto Const0 = B.buildConstant(S32, FractBits - 32);
2048   auto Const1 = B.buildConstant(S32, ExpBits);
2049 
2050   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2051     .addUse(Hi)
2052     .addUse(Const0.getReg(0))
2053     .addUse(Const1.getReg(0));
2054 
2055   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2056 }
2057 
2058 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2059   MachineInstr &MI, MachineRegisterInfo &MRI,
2060   MachineIRBuilder &B) const {
2061   const LLT S1 = LLT::scalar(1);
2062   const LLT S32 = LLT::scalar(32);
2063   const LLT S64 = LLT::scalar(64);
2064 
2065   Register Src = MI.getOperand(1).getReg();
2066   assert(MRI.getType(Src) == S64);
2067 
2068   // TODO: Should this use extract since the low half is unused?
2069   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2070   Register Hi = Unmerge.getReg(1);
2071 
2072   // Extract the upper half, since this is where we will find the sign and
2073   // exponent.
2074   auto Exp = extractF64Exponent(Hi, B);
2075 
2076   const unsigned FractBits = 52;
2077 
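  // Truncation clears the fractional bits below the binary point:
  //   trunc(x) = x & ~(fract_mask >> exp)
  // where exp < 0 leaves only the sign bit (+/-0.0) and exp > 51 means the
  // value is already an integer.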
2078   // Extract the sign bit.
2079   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2080   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2081 
2082   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2083 
2084   const auto Zero32 = B.buildConstant(S32, 0);
2085 
2086   // Extend back to 64-bits.
2087   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2088 
2089   auto Shr = B.buildAShr(S64, FractMask, Exp);
2090   auto Not = B.buildNot(S64, Shr);
2091   auto Tmp0 = B.buildAnd(S64, Src, Not);
2092   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2093 
2094   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2095   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2096 
2097   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2098   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2099   MI.eraseFromParent();
2100   return true;
2101 }
2102 
2103 bool AMDGPULegalizerInfo::legalizeITOFP(
2104   MachineInstr &MI, MachineRegisterInfo &MRI,
2105   MachineIRBuilder &B, bool Signed) const {
2106 
2107   Register Dst = MI.getOperand(0).getReg();
2108   Register Src = MI.getOperand(1).getReg();
2109 
2110   const LLT S64 = LLT::scalar(64);
2111   const LLT S32 = LLT::scalar(32);
2112 
2113   assert(MRI.getType(Src) == S64);
2114 
2115   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2116   auto ThirtyTwo = B.buildConstant(S32, 32);
2117 
2118   if (MRI.getType(Dst) == S64) {
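    // Convert each half separately and combine:
    //   itofp(src) = ldexp(itofp(hi), 32) + uitofp(lo)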
2119     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2120                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2121 
2122     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2123     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2124                      .addUse(CvtHi.getReg(0))
2125                      .addUse(ThirtyTwo.getReg(0));
2126 
2127     // TODO: Should this propagate fast-math-flags?
2128     B.buildFAdd(Dst, LdExp, CvtLo);
2129     MI.eraseFromParent();
2130     return true;
2131   }
2132 
2133   assert(MRI.getType(Dst) == S32);
2134 
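  // For a 32-bit result, roughly: shift the source left so its significant
  // bits land in the high word, convert that word to float (OR-ing in a sticky
  // bit so rounding still sees the discarded low half), then rescale the
  // result with ldexp(fval, 32 - shift).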
2135   auto One = B.buildConstant(S32, 1);
2136 
2137   MachineInstrBuilder ShAmt;
2138   if (Signed) {
2139     auto ThirtyOne = B.buildConstant(S32, 31);
2140     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2141     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2142     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2143     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2144                                /*HasSideEffects=*/false)
2145                   .addUse(Unmerge.getReg(1));
2146     auto LS2 = B.buildSub(S32, LS, One);
2147     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2148   } else
2149     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2150   auto Norm = B.buildShl(S64, Src, ShAmt);
2151   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2152   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2153   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2154   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2155   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2156   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2157                    /*HasSideEffects=*/false)
2158       .addUse(FVal.getReg(0))
2159       .addUse(Scale.getReg(0));
2160   MI.eraseFromParent();
2161   return true;
2162 }
2163 
2164 // TODO: Copied from DAG implementation. Verify logic and document how this
2165 // actually works.
2166 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2167                                         MachineRegisterInfo &MRI,
2168                                         MachineIRBuilder &B,
2169                                         bool Signed) const {
2170 
2171   Register Dst = MI.getOperand(0).getReg();
2172   Register Src = MI.getOperand(1).getReg();
2173 
2174   const LLT S64 = LLT::scalar(64);
2175   const LLT S32 = LLT::scalar(32);
2176 
2177   const LLT SrcLT = MRI.getType(Src);
2178   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2179 
2180   unsigned Flags = MI.getFlags();
2181 
2182   // The basic idea of converting a floating point number into a pair of 32-bit
2183   // integers is illustrated as follows:
2184   //
2185   //     tf := trunc(val);
2186   //    hif := floor(tf * 2^-32);
2187   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2188   //     hi := fptoi(hif);
2189   //     lo := fptoi(lof);
2190   //
2191   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2192   MachineInstrBuilder Sign;
2193   if (Signed && SrcLT == S32) {
2194     // However, a 32-bit floating point number has only a 23-bit mantissa,
2195     // which is not enough to hold all the significant bits of `lof` if val is
2196     // negative. To avoid the loss of precision, we need to take the absolute
2197     // value after truncating and flip the result back based on the original
2198     // signedness.
2199     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2200     Trunc = B.buildFAbs(S32, Trunc, Flags);
2201   }
2202   MachineInstrBuilder K0, K1;
2203   if (SrcLT == S64) {
2204     K0 = B.buildFConstant(S64,
2205                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2206     K1 = B.buildFConstant(S64,
2207                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2208   } else {
2209     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2210     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2211   }
2212 
2213   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2214   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2215   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2216 
2217   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2218                                      : B.buildFPTOUI(S32, FloorMul);
2219   auto Lo = B.buildFPTOUI(S32, Fma);
2220 
2221   if (Signed && SrcLT == S32) {
2222     // Flip the result based on the signedness, which is either all 0s or 1s.
2223     Sign = B.buildMerge(S64, {Sign, Sign});
2224     // r := xor({lo, hi}, sign) - sign;
2225     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2226   } else
2227     B.buildMerge(Dst, {Lo, Hi});
2228   MI.eraseFromParent();
2229 
2230   return true;
2231 }
2232 
2233 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2234                                                MachineInstr &MI) const {
2235   MachineFunction &MF = Helper.MIRBuilder.getMF();
2236   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2237 
2238   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2239                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2240 
2241   // With ieee_mode disabled, the instructions have the correct behavior
2242   // already for G_FMINNUM/G_FMAXNUM
2243   if (!MFI->getMode().IEEE)
2244     return !IsIEEEOp;
2245 
2246   if (IsIEEEOp)
2247     return true;
2248 
2249   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2250 }
2251 
2252 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2253   MachineInstr &MI, MachineRegisterInfo &MRI,
2254   MachineIRBuilder &B) const {
2255   // TODO: Should move some of this into LegalizerHelper.
2256 
2257   // TODO: Promote dynamic indexing of s16 to s32
2258 
2259   // FIXME: Artifact combiner probably should have replaced the truncated
2260   // constant before this, so we shouldn't need
2261   // getIConstantVRegValWithLookThrough.
2262   Optional<ValueAndVReg> MaybeIdxVal =
2263       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2264   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2265     return true;
2266   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2267 
2268   Register Dst = MI.getOperand(0).getReg();
2269   Register Vec = MI.getOperand(1).getReg();
2270 
2271   LLT VecTy = MRI.getType(Vec);
2272   LLT EltTy = VecTy.getElementType();
2273   assert(EltTy == MRI.getType(Dst));
2274 
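  // A constant in-bounds index is handled by fully unmerging the vector and
  // copying out the selected element; e.g. extracting element 2 of <4 x s32>
  // becomes an unmerge into four s32 values plus a copy of the third.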
2275   if (IdxVal < VecTy.getNumElements()) {
2276     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2277     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2278   } else {
2279     B.buildUndef(Dst);
2280   }
2281 
2282   MI.eraseFromParent();
2283   return true;
2284 }
2285 
2286 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2287   MachineInstr &MI, MachineRegisterInfo &MRI,
2288   MachineIRBuilder &B) const {
2289   // TODO: Should move some of this into LegalizerHelper.
2290 
2291   // TODO: Promote dynamic indexing of s16 to s32
2292 
2293   // FIXME: Artifact combiner probably should have replaced the truncated
2294   // constant before this, so we shouldn't need
2295   // getIConstantVRegValWithLookThrough.
2296   Optional<ValueAndVReg> MaybeIdxVal =
2297       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2298   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2299     return true;
2300 
2301   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2302   Register Dst = MI.getOperand(0).getReg();
2303   Register Vec = MI.getOperand(1).getReg();
2304   Register Ins = MI.getOperand(2).getReg();
2305 
2306   LLT VecTy = MRI.getType(Vec);
2307   LLT EltTy = VecTy.getElementType();
2308   assert(EltTy == MRI.getType(Ins));
2309   (void)Ins;
2310 
2311   unsigned NumElts = VecTy.getNumElements();
2312   if (IdxVal < NumElts) {
2313     SmallVector<Register, 8> SrcRegs;
2314     for (unsigned i = 0; i < NumElts; ++i)
2315       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2316     B.buildUnmerge(SrcRegs, Vec);
2317 
2318     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2319     B.buildMerge(Dst, SrcRegs);
2320   } else {
2321     B.buildUndef(Dst);
2322   }
2323 
2324   MI.eraseFromParent();
2325   return true;
2326 }
2327 
2328 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2329   MachineInstr &MI, MachineRegisterInfo &MRI,
2330   MachineIRBuilder &B) const {
2331   const LLT V2S16 = LLT::fixed_vector(2, 16);
2332 
2333   Register Dst = MI.getOperand(0).getReg();
2334   Register Src0 = MI.getOperand(1).getReg();
2335   LLT DstTy = MRI.getType(Dst);
2336   LLT SrcTy = MRI.getType(Src0);
2337 
2338   if (SrcTy == V2S16 && DstTy == V2S16 &&
2339       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2340     return true;
2341 
2342   MachineIRBuilder HelperBuilder(MI);
2343   GISelObserverWrapper DummyObserver;
2344   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2345   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2346 }
2347 
2348 bool AMDGPULegalizerInfo::legalizeSinCos(
2349   MachineInstr &MI, MachineRegisterInfo &MRI,
2350   MachineIRBuilder &B) const {
2351 
2352   Register DstReg = MI.getOperand(0).getReg();
2353   Register SrcReg = MI.getOperand(1).getReg();
2354   LLT Ty = MRI.getType(DstReg);
2355   unsigned Flags = MI.getFlags();
2356 
2357   Register TrigVal;
2358   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2359   if (ST.hasTrigReducedRange()) {
2360     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2361     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2362       .addUse(MulVal.getReg(0))
2363       .setMIFlags(Flags).getReg(0);
2364   } else
2365     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2366 
2367   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2368     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2369   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2370     .addUse(TrigVal)
2371     .setMIFlags(Flags);
2372   MI.eraseFromParent();
2373   return true;
2374 }
2375 
2376 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2377                                                   MachineIRBuilder &B,
2378                                                   const GlobalValue *GV,
2379                                                   int64_t Offset,
2380                                                   unsigned GAFlags) const {
2381   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2382   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2383   // to the following code sequence:
2384   //
2385   // For constant address space:
2386   //   s_getpc_b64 s[0:1]
2387   //   s_add_u32 s0, s0, $symbol
2388   //   s_addc_u32 s1, s1, 0
2389   //
2390   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2391   //   a fixup or relocation is emitted to replace $symbol with a literal
2392   //   constant, which is a pc-relative offset from the encoding of the $symbol
2393   //   operand to the global variable.
2394   //
2395   // For global address space:
2396   //   s_getpc_b64 s[0:1]
2397   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2398   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2399   //
2400   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2401   //   fixups or relocations are emitted to replace $symbol@*@lo and
2402   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2403   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2404   //   operand to the global variable.
2405   //
2406   // What we want here is an offset from the value returned by s_getpc
2407   // (which is the address of the s_add_u32 instruction) to the global
2408   // variable, but since the encoding of $symbol starts 4 bytes after the start
2409   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2410   // small. This requires us to add 4 to the global variable offset in order to
2411   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2412   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2413   // instruction.
2414 
2415   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2416 
2417   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2418     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2419 
2420   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2421     .addDef(PCReg);
2422 
2423   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2424   if (GAFlags == SIInstrInfo::MO_NONE)
2425     MIB.addImm(0);
2426   else
2427     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2428 
2429   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2430 
2431   if (PtrTy.getSizeInBits() == 32)
2432     B.buildExtract(DstReg, PCReg, 0);
2433   return true;
2434 }
2435 
2436 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2437   MachineInstr &MI, MachineRegisterInfo &MRI,
2438   MachineIRBuilder &B) const {
2439   Register DstReg = MI.getOperand(0).getReg();
2440   LLT Ty = MRI.getType(DstReg);
2441   unsigned AS = Ty.getAddressSpace();
2442 
2443   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2444   MachineFunction &MF = B.getMF();
2445   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2446 
2447   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2448     if (!MFI->isModuleEntryFunction() &&
2449         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2450       const Function &Fn = MF.getFunction();
2451       DiagnosticInfoUnsupported BadLDSDecl(
2452         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2453         DS_Warning);
2454       Fn.getContext().diagnose(BadLDSDecl);
2455 
2456       // We currently don't have a way to correctly allocate LDS objects that
2457       // aren't directly associated with a kernel. We do force inlining of
2458       // functions that use local objects. However, if these dead functions are
2459       // not eliminated, we don't want a compile time error. Just emit a warning
2460       // and a trap, since there should be no callable path here.
2461       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2462       B.buildUndef(DstReg);
2463       MI.eraseFromParent();
2464       return true;
2465     }
2466 
2467     // TODO: We could emit code to handle the initialization somewhere.
2468     // We ignore the initializer for now and legalize it to allow selection.
2469     // The initializer will anyway get errored out during assembly emission.
2470     const SITargetLowering *TLI = ST.getTargetLowering();
2471     if (!TLI->shouldUseLDSConstAddress(GV)) {
2472       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2473       return true; // Leave in place.
2474     }
2475 
2476     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2477       Type *Ty = GV->getValueType();
2478       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2479       // zero-sized type in other languages to declare dynamic shared memory
2480       // whose size is not known at compile time. Such arrays are allocated by
2481       // the runtime and placed directly after the statically allocated ones.
2482       // They all share the same offset.
2483       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2484         // Adjust alignment for that dynamic shared memory array.
2485         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2486         LLT S32 = LLT::scalar(32);
2487         auto Sz =
2488             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2489         B.buildIntToPtr(DstReg, Sz);
2490         MI.eraseFromParent();
2491         return true;
2492       }
2493     }
2494 
2495     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2496                                                    *cast<GlobalVariable>(GV)));
2497     MI.eraseFromParent();
2498     return true;
2499   }
2500 
2501   const SITargetLowering *TLI = ST.getTargetLowering();
2502 
2503   if (TLI->shouldEmitFixup(GV)) {
2504     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2505     MI.eraseFromParent();
2506     return true;
2507   }
2508 
2509   if (TLI->shouldEmitPCReloc(GV)) {
2510     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2511     MI.eraseFromParent();
2512     return true;
2513   }
2514 
2515   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2516   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2517 
2518   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2519   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2520       MachinePointerInfo::getGOT(MF),
2521       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2522           MachineMemOperand::MOInvariant,
2523       LoadTy, Align(8));
2524 
2525   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2526 
2527   if (Ty.getSizeInBits() == 32) {
2528     // Truncate if this is a 32-bit constant address.
2529     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2530     B.buildExtract(DstReg, Load, 0);
2531   } else
2532     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2533 
2534   MI.eraseFromParent();
2535   return true;
2536 }
2537 
2538 static LLT widenToNextPowerOf2(LLT Ty) {
2539   if (Ty.isVector())
2540     return Ty.changeElementCount(
2541         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2542   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2543 }
2544 
2545 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2546                                        MachineInstr &MI) const {
2547   MachineIRBuilder &B = Helper.MIRBuilder;
2548   MachineRegisterInfo &MRI = *B.getMRI();
2549   GISelChangeObserver &Observer = Helper.Observer;
2550 
2551   Register PtrReg = MI.getOperand(1).getReg();
2552   LLT PtrTy = MRI.getType(PtrReg);
2553   unsigned AddrSpace = PtrTy.getAddressSpace();
2554 
2555   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2556     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2557     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2558     Observer.changingInstr(MI);
2559     MI.getOperand(1).setReg(Cast.getReg(0));
2560     Observer.changedInstr(MI);
2561     return true;
2562   }
2563 
2564   if (MI.getOpcode() != AMDGPU::G_LOAD)
2565     return false;
2566 
2567   Register ValReg = MI.getOperand(0).getReg();
2568   LLT ValTy = MRI.getType(ValReg);
2569 
2570   MachineMemOperand *MMO = *MI.memoperands_begin();
2571   const unsigned ValSize = ValTy.getSizeInBits();
2572   const LLT MemTy = MMO->getMemoryType();
2573   const Align MemAlign = MMO->getAlign();
2574   const unsigned MemSize = MemTy.getSizeInBits();
2575   const uint64_t AlignInBits = 8 * MemAlign.value();
2576 
2577   // Widen non-power-of-2 loads to the alignment if needed
2578   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2579     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2580 
2581     // This was already the correct extending load result type, so just adjust
2582     // the memory type.
2583     if (WideMemSize == ValSize) {
2584       MachineFunction &MF = B.getMF();
2585 
2586       MachineMemOperand *WideMMO =
2587           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2588       Observer.changingInstr(MI);
2589       MI.setMemRefs(MF, {WideMMO});
2590       Observer.changedInstr(MI);
2591       return true;
2592     }
2593 
2594     // Don't bother handling this edge case; it should probably never be produced.
2595     if (ValSize > WideMemSize)
2596       return false;
2597 
2598     LLT WideTy = widenToNextPowerOf2(ValTy);
2599 
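    // e.g. a 96-bit <3 x s32> value is loaded as a 128-bit <4 x s32>, and the
    // extra element is discarded below.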
2600     Register WideLoad;
2601     if (!WideTy.isVector()) {
2602       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2603       B.buildTrunc(ValReg, WideLoad).getReg(0);
2604     } else {
2605       // Extract the subvector.
2606 
2607       if (isRegisterType(ValTy)) {
2608         // If this is a case where G_EXTRACT is legal, use it.
2609         // (e.g. <3 x s32> -> <4 x s32>)
2610         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2611         B.buildExtract(ValReg, WideLoad, 0);
2612       } else {
2613         // For cases where the widened type isn't a nice register value, load
2614         // a wider value and drop the trailing elements (e.g. <3 x s16> -> <4 x s16>)
2615         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2616         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2617       }
2618     }
2619 
2620     MI.eraseFromParent();
2621     return true;
2622   }
2623 
2624   return false;
2625 }
2626 
2627 bool AMDGPULegalizerInfo::legalizeFMad(
2628   MachineInstr &MI, MachineRegisterInfo &MRI,
2629   MachineIRBuilder &B) const {
2630   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2631   assert(Ty.isScalar());
2632 
2633   MachineFunction &MF = B.getMF();
2634   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2635 
2636   // TODO: Always legal with future ftz flag.
2637   // FIXME: Do we only need to check the output type?
2638   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2639     return true;
2640   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2641     return true;
2642 
2643   MachineIRBuilder HelperBuilder(MI);
2644   GISelObserverWrapper DummyObserver;
2645   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2646   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2647 }
2648 
2649 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2650   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2651   Register DstReg = MI.getOperand(0).getReg();
2652   Register PtrReg = MI.getOperand(1).getReg();
2653   Register CmpVal = MI.getOperand(2).getReg();
2654   Register NewVal = MI.getOperand(3).getReg();
2655 
2656   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2657          "this should not have been custom lowered");
2658 
2659   LLT ValTy = MRI.getType(CmpVal);
2660   LLT VecTy = LLT::fixed_vector(2, ValTy);
2661 
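  // The target cmpswap instruction consumes the new value and the compare
  // value as one packed data operand, so build a two-element vector with the
  // new value in element 0 and the compare value in element 1.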
2662   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2663 
2664   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2665     .addDef(DstReg)
2666     .addUse(PtrReg)
2667     .addUse(PackedVal)
2668     .setMemRefs(MI.memoperands());
2669 
2670   MI.eraseFromParent();
2671   return true;
2672 }
2673 
2674 bool AMDGPULegalizerInfo::legalizeFlog(
2675   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2676   Register Dst = MI.getOperand(0).getReg();
2677   Register Src = MI.getOperand(1).getReg();
2678   LLT Ty = B.getMRI()->getType(Dst);
2679   unsigned Flags = MI.getFlags();
2680 
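  // Change of base: log_b(x) = log2(x) * log_b(2). The caller supplies
  // log_b(2) as Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).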
2681   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2682   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2683 
2684   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2685   MI.eraseFromParent();
2686   return true;
2687 }
2688 
2689 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2690                                        MachineIRBuilder &B) const {
2691   Register Dst = MI.getOperand(0).getReg();
2692   Register Src = MI.getOperand(1).getReg();
2693   unsigned Flags = MI.getFlags();
2694   LLT Ty = B.getMRI()->getType(Dst);
2695 
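  // exp(x) = exp2(x * log2(e))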
2696   auto K = B.buildFConstant(Ty, numbers::log2e);
2697   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2698   B.buildFExp2(Dst, Mul, Flags);
2699   MI.eraseFromParent();
2700   return true;
2701 }
2702 
2703 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2704                                        MachineIRBuilder &B) const {
2705   Register Dst = MI.getOperand(0).getReg();
2706   Register Src0 = MI.getOperand(1).getReg();
2707   Register Src1 = MI.getOperand(2).getReg();
2708   unsigned Flags = MI.getFlags();
2709   LLT Ty = B.getMRI()->getType(Dst);
2710   const LLT S16 = LLT::scalar(16);
2711   const LLT S32 = LLT::scalar(32);
2712 
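  // pow(x, y) = exp2(y * log2(x)). The legacy multiply (which treats 0 * x as
  // 0) is used here, presumably so the infinities log2 produces for the
  // pow(0, y) / pow(1, inf) edge cases don't turn the product into a NaN.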
2713   if (Ty == S32) {
2714     auto Log = B.buildFLog2(S32, Src0, Flags);
2715     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2716       .addUse(Log.getReg(0))
2717       .addUse(Src1)
2718       .setMIFlags(Flags);
2719     B.buildFExp2(Dst, Mul, Flags);
2720   } else if (Ty == S16) {
2721     // There's no f16 fmul_legacy, so we need to convert for it.
2722     auto Log = B.buildFLog2(S16, Src0, Flags);
2723     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2724     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2725     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2726       .addUse(Ext0.getReg(0))
2727       .addUse(Ext1.getReg(0))
2728       .setMIFlags(Flags);
2729 
2730     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2731   } else
2732     return false;
2733 
2734   MI.eraseFromParent();
2735   return true;
2736 }
2737 
2738 // Find a source register, ignoring any possible source modifiers.
2739 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2740   Register ModSrc = OrigSrc;
2741   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2742     ModSrc = SrcFNeg->getOperand(1).getReg();
2743     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2744       ModSrc = SrcFAbs->getOperand(1).getReg();
2745   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2746     ModSrc = SrcFAbs->getOperand(1).getReg();
2747   return ModSrc;
2748 }
2749 
2750 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2751                                          MachineRegisterInfo &MRI,
2752                                          MachineIRBuilder &B) const {
2753 
2754   const LLT S1 = LLT::scalar(1);
2755   const LLT S64 = LLT::scalar(64);
2756   Register Dst = MI.getOperand(0).getReg();
2757   Register OrigSrc = MI.getOperand(1).getReg();
2758   unsigned Flags = MI.getFlags();
2759   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2760          "this should not have been custom lowered");
2761 
2762   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2763   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2764   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2765   // V_FRACT bug is:
2766   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2767   //
2768   // Convert floor(x) to (x - fract(x))
2769 
2770   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2771     .addUse(OrigSrc)
2772     .setMIFlags(Flags);
2773 
2774   // Give source modifier matching some assistance before obscuring a foldable
2775   // pattern.
2776 
2777   // TODO: Can we avoid the neg on the fract? The input sign to fract
2778   // shouldn't matter?
2779   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2780 
2781   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2782 
2783   Register Min = MRI.createGenericVirtualRegister(S64);
2784 
2785   // We don't need to concern ourselves with the snan handling difference, so
2786   // use the one which will directly select.
2787   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2788   if (MFI->getMode().IEEE)
2789     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2790   else
2791     B.buildFMinNum(Min, Fract, Const, Flags);
2792 
2793   Register CorrectedFract = Min;
2794   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2795     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2796     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2797   }
2798 
2799   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2800   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2801 
2802   MI.eraseFromParent();
2803   return true;
2804 }
2805 
2806 // Turn an illegal packed v2s16 build vector into bit operations.
2807 // TODO: This should probably be a bitcast action in LegalizerHelper.
2808 bool AMDGPULegalizerInfo::legalizeBuildVector(
2809   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2810   Register Dst = MI.getOperand(0).getReg();
2811   const LLT S32 = LLT::scalar(32);
2812   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2813 
2814   Register Src0 = MI.getOperand(1).getReg();
2815   Register Src1 = MI.getOperand(2).getReg();
2816   assert(MRI.getType(Src0) == LLT::scalar(16));
2817 
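  // Lower v2s16 build_vector (a, b) as a 32-bit merge of the two halves
  // followed by a bitcast back to <2 x s16>.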
2818   auto Merge = B.buildMerge(S32, {Src0, Src1});
2819   B.buildBitcast(Dst, Merge);
2820 
2821   MI.eraseFromParent();
2822   return true;
2823 }
2824 
2825 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2826 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2827 // case with a single min instruction instead of a compare+select.
2828 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2829                                             MachineRegisterInfo &MRI,
2830                                             MachineIRBuilder &B) const {
2831   Register Dst = MI.getOperand(0).getReg();
2832   Register Src = MI.getOperand(1).getReg();
2833   LLT DstTy = MRI.getType(Dst);
2834   LLT SrcTy = MRI.getType(Src);
2835 
2836   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2837                         ? AMDGPU::G_AMDGPU_FFBH_U32
2838                         : AMDGPU::G_AMDGPU_FFBL_B32;
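  // ffbh/ffbl return -1 (all ones) for a zero input, so clamping the result
  // with umin(result, bitwidth) produces the defined ctlz/cttz value of
  // "bitwidth" when the source is zero.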
2839   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2840   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2841 
2842   MI.eraseFromParent();
2843   return true;
2844 }
2845 
2846 // Check that this is a G_XOR x, -1
2847 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2848   if (MI.getOpcode() != TargetOpcode::G_XOR)
2849     return false;
2850   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2851   return ConstVal && *ConstVal == -1;
2852 }
2853 
// Return the use branch instruction, or null if the usage is invalid.
2855 static MachineInstr *
2856 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2857                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2858   Register CondDef = MI.getOperand(0).getReg();
2859   if (!MRI.hasOneNonDBGUse(CondDef))
2860     return nullptr;
2861 
2862   MachineBasicBlock *Parent = MI.getParent();
2863   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2864 
2865   if (isNot(MRI, *UseMI)) {
2866     Register NegatedCond = UseMI->getOperand(0).getReg();
2867     if (!MRI.hasOneNonDBGUse(NegatedCond))
2868       return nullptr;
2869 
    // We're folding the negation into the branch handling, so the G_XOR's
    // def is now dead; erase it.
2871     eraseInstr(*UseMI, MRI);
2872 
2873     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2874     Negated = true;
2875   }
2876 
2877   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2878     return nullptr;
2879 
2880   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2881   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2882   if (Next == Parent->end()) {
2883     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2884     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2885       return nullptr;
2886     UncondBrTarget = &*NextMBB;
2887   } else {
2888     if (Next->getOpcode() != AMDGPU::G_BR)
2889       return nullptr;
2890     Br = &*Next;
2891     UncondBrTarget = Br->getOperand(0).getMBB();
2892   }
2893 
2894   return UseMI;
2895 }
2896 
2897 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2898                                          const ArgDescriptor *Arg,
2899                                          const TargetRegisterClass *ArgRC,
2900                                          LLT ArgTy) const {
2901   MCRegister SrcReg = Arg->getRegister();
2902   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2903   assert(DstReg.isVirtual() && "Virtual register expected");
2904 
2905   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2906                                              *ArgRC, B.getDebugLoc(), ArgTy);
2907   if (Arg->isMasked()) {
2908     // TODO: Should we try to emit this once in the entry block?
2909     const LLT S32 = LLT::scalar(32);
2910     const unsigned Mask = Arg->getMask();
2911     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2912 
2913     Register AndMaskSrc = LiveIn;
2914 
2915     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2916     // 0.
2917     if (Shift != 0) {
2918       auto ShiftAmt = B.buildConstant(S32, Shift);
2919       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2920     }
2921 
2922     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2923   } else {
2924     B.buildCopy(DstReg, LiveIn);
2925   }
2926 
2927   return true;
2928 }
2929 
2930 bool AMDGPULegalizerInfo::loadInputValue(
2931     Register DstReg, MachineIRBuilder &B,
2932     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2933   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2934   const ArgDescriptor *Arg;
2935   const TargetRegisterClass *ArgRC;
2936   LLT ArgTy;
2937   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2938 
2939   if (!Arg) {
2940     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a zero-sized kernarg segment,
      // in which case the pointer argument may be missing and we use null.
2943       B.buildConstant(DstReg, 0);
2944       return true;
2945     }
2946 
2947     // It's undefined behavior if a function marked with the amdgpu-no-*
2948     // attributes uses the corresponding intrinsic.
2949     B.buildUndef(DstReg);
2950     return true;
2951   }
2952 
2953   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2954     return false; // TODO: Handle these
2955   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2956 }
2957 
2958 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2959     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2960     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2961   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2962     return false;
2963 
2964   MI.eraseFromParent();
2965   return true;
2966 }
2967 
2968 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2969                                        MachineRegisterInfo &MRI,
2970                                        MachineIRBuilder &B) const {
2971   Register Dst = MI.getOperand(0).getReg();
2972   LLT DstTy = MRI.getType(Dst);
2973   LLT S16 = LLT::scalar(16);
2974   LLT S32 = LLT::scalar(32);
2975   LLT S64 = LLT::scalar(64);
2976 
2977   if (DstTy == S16)
2978     return legalizeFDIV16(MI, MRI, B);
2979   if (DstTy == S32)
2980     return legalizeFDIV32(MI, MRI, B);
2981   if (DstTy == S64)
2982     return legalizeFDIV64(MI, MRI, B);
2983 
2984   return false;
2985 }
2986 
2987 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2988                                                         Register DstDivReg,
2989                                                         Register DstRemReg,
2990                                                         Register X,
2991                                                         Register Y) const {
2992   const LLT S1 = LLT::scalar(1);
2993   const LLT S32 = LLT::scalar(32);
2994 
2995   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2996   // algorithm used here.
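  //
  // Roughly, in scalar form (illustrative sketch; umulh() stands for the high
  // 32 bits of a 32x32->64 multiply, and 0x4f7ffffe is ~2^32 as a float):
  //   uint32_t z = (uint32_t)((1.0f / (float)y) * 0x1.fffffcp31f);
  //   z += umulh(z, -y * z);            // one Newton-Raphson refinement
  //   uint32_t q = umulh(x, z);         // quotient estimate
  //   uint32_t r = x - q * y;           // remainder estimate
  //   if (r >= y) { ++q; r -= y; }      // first refinement
  //   if (r >= y) { ++q; r -= y; }      // second refinement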
2997 
2998   // Initial estimate of inv(y).
2999   auto FloatY = B.buildUITOFP(S32, Y);
3000   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3001   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3002   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3003   auto Z = B.buildFPTOUI(S32, ScaledY);
3004 
  // One round of UNR (unsigned integer Newton-Raphson) refinement.
3006   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3007   auto NegYZ = B.buildMul(S32, NegY, Z);
3008   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3009 
3010   // Quotient/remainder estimate.
3011   auto Q = B.buildUMulH(S32, X, Z);
3012   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3013 
3014   // First quotient/remainder refinement.
3015   auto One = B.buildConstant(S32, 1);
3016   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3017   if (DstDivReg)
3018     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3019   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3020 
3021   // Second quotient/remainder refinement.
3022   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3023   if (DstDivReg)
3024     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3025 
3026   if (DstRemReg)
3027     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3028 }
3029 
3030 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3031 //
3032 // Return lo, hi of result
3033 //
3034 // %cvt.lo = G_UITOFP Val.lo
3035 // %cvt.hi = G_UITOFP Val.hi
3036 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3037 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3038 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3039 // %mul2 = G_FMUL %mul1, 2**(-32)
3040 // %trunc = G_INTRINSIC_TRUNC %mul2
3041 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3042 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
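//
// In other words, this computes an approximation of 2^64 / Val as a 64-bit
// fixed-point value, returned as {lo, hi} 32-bit halves (0x5f7ffffc is just
// under 2^64 as a float, 0x4f800000 is 2^32, 0x2f800000 is 2^-32, and
// 0xcf800000 is -2^32). This is a rough summary, not a precise error bound.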
3043 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3044                                                        Register Val) {
3045   const LLT S32 = LLT::scalar(32);
3046   auto Unmerge = B.buildUnmerge(S32, Val);
3047 
3048   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3049   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3050 
3051   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3052                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3053 
3054   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3055   auto Mul1 =
3056       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3057 
3058   // 2**(-32)
3059   auto Mul2 =
3060       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3061   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3062 
3063   // -(2**32)
3064   auto Mad2 = B.buildFMAD(S32, Trunc,
3065                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3066 
3067   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3068   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3069 
3070   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3071 }
3072 
3073 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3074                                                         Register DstDivReg,
3075                                                         Register DstRemReg,
3076                                                         Register Numer,
3077                                                         Register Denom) const {
3078   const LLT S32 = LLT::scalar(32);
3079   const LLT S64 = LLT::scalar(64);
3080   const LLT S1 = LLT::scalar(1);
3081   Register RcpLo, RcpHi;
3082 
3083   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3084 
3085   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3086 
3087   auto Zero64 = B.buildConstant(S64, 0);
3088   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3089 
3090   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3091   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3092 
3093   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3094   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3095   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3096 
3097   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3098   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3099   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3100 
3101   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3102   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3103   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3104   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3105   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3106 
3107   auto Zero32 = B.buildConstant(S32, 0);
3108   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3109   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3110   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3111 
3112   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3113   Register NumerLo = UnmergeNumer.getReg(0);
3114   Register NumerHi = UnmergeNumer.getReg(1);
3115 
3116   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3117   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3118   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3119   Register Mul3_Lo = UnmergeMul3.getReg(0);
3120   Register Mul3_Hi = UnmergeMul3.getReg(1);
3121   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3122   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3123   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3124   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3125 
3126   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3127   Register DenomLo = UnmergeDenom.getReg(0);
3128   Register DenomHi = UnmergeDenom.getReg(1);
3129 
3130   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3131   auto C1 = B.buildSExt(S32, CmpHi);
3132 
3133   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3134   auto C2 = B.buildSExt(S32, CmpLo);
3135 
3136   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3137   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3138 
  // TODO: The code here and below could be enclosed in if/endif blocks.
  // Currently the control flow is unconditional and the 4 selects after the
  // potential endif points stand in for PHIs.
3142 
3143   // if C3 != 0 ...
3144   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3145   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3146   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3147   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3148 
3149   auto One64 = B.buildConstant(S64, 1);
3150   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3151 
3152   auto C4 =
3153       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3154   auto C5 =
3155       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3156   auto C6 = B.buildSelect(
3157       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3158 
3159   // if (C6 != 0)
3160   auto Add4 = B.buildAdd(S64, Add3, One64);
3161   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3162 
3163   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3164   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3165   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3166 
3167   // endif C6
3168   // endif C3
3169 
3170   if (DstDivReg) {
3171     auto Sel1 = B.buildSelect(
3172         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3173     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3174                   Sel1, MulHi3);
3175   }
3176 
3177   if (DstRemReg) {
3178     auto Sel2 = B.buildSelect(
3179         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3180     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3181                   Sel2, Sub1);
3182   }
3183 }
3184 
3185 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3186                                                   MachineRegisterInfo &MRI,
3187                                                   MachineIRBuilder &B) const {
3188   Register DstDivReg, DstRemReg;
3189   switch (MI.getOpcode()) {
3190   default:
3191     llvm_unreachable("Unexpected opcode!");
3192   case AMDGPU::G_UDIV: {
3193     DstDivReg = MI.getOperand(0).getReg();
3194     break;
3195   }
3196   case AMDGPU::G_UREM: {
3197     DstRemReg = MI.getOperand(0).getReg();
3198     break;
3199   }
3200   case AMDGPU::G_UDIVREM: {
3201     DstDivReg = MI.getOperand(0).getReg();
3202     DstRemReg = MI.getOperand(1).getReg();
3203     break;
3204   }
3205   }
3206 
3207   const LLT S64 = LLT::scalar(64);
3208   const LLT S32 = LLT::scalar(32);
3209   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3210   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3211   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3212   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3213 
3214   if (Ty == S32)
3215     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3216   else if (Ty == S64)
3217     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3218   else
3219     return false;
3220 
3221   MI.eraseFromParent();
3222   return true;
3223 }
3224 
3225 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3226                                                 MachineRegisterInfo &MRI,
3227                                                 MachineIRBuilder &B) const {
3228   const LLT S64 = LLT::scalar(64);
3229   const LLT S32 = LLT::scalar(32);
3230 
3231   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3232   if (Ty != S32 && Ty != S64)
3233     return false;
3234 
3235   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3236   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3237   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3238 
3239   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3240   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3241   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3242 
3243   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3244   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3245 
3246   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3247   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
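
  // This is the usual branchless abs: for a negative LHS, LHSign is -1, so
  // (LHS + LHSign) ^ LHSign == -LHS; for a non-negative LHS, LHSign is 0 and
  // the value is unchanged. E.g. LHS = -5: -5 + -1 = -6, and -6 ^ -1 = 5.
  // The signs are reapplied below with the inverse (x ^ sign) - sign pattern.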
3248 
3249   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3250   switch (MI.getOpcode()) {
3251   default:
3252     llvm_unreachable("Unexpected opcode!");
3253   case AMDGPU::G_SDIV: {
3254     DstDivReg = MI.getOperand(0).getReg();
3255     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3256     break;
3257   }
3258   case AMDGPU::G_SREM: {
3259     DstRemReg = MI.getOperand(0).getReg();
3260     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3261     break;
3262   }
3263   case AMDGPU::G_SDIVREM: {
3264     DstDivReg = MI.getOperand(0).getReg();
3265     DstRemReg = MI.getOperand(1).getReg();
3266     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3267     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3268     break;
3269   }
3270   }
3271 
3272   if (Ty == S32)
3273     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3274   else
3275     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3276 
3277   if (DstDivReg) {
3278     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3279     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3280     B.buildSub(DstDivReg, SignXor, Sign);
3281   }
3282 
3283   if (DstRemReg) {
3284     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3285     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3286     B.buildSub(DstRemReg, SignXor, Sign);
3287   }
3288 
3289   MI.eraseFromParent();
3290   return true;
3291 }
3292 
3293 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3294                                                  MachineRegisterInfo &MRI,
3295                                                  MachineIRBuilder &B) const {
3296   Register Res = MI.getOperand(0).getReg();
3297   Register LHS = MI.getOperand(1).getReg();
3298   Register RHS = MI.getOperand(2).getReg();
3299   uint16_t Flags = MI.getFlags();
3300   LLT ResTy = MRI.getType(Res);
3301 
3302   const MachineFunction &MF = B.getMF();
3303   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3304                             MI.getFlag(MachineInstr::FmAfn);
3305 
3306   if (!AllowInaccurateRcp)
3307     return false;
3308 
3309   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3310     // 1 / x -> RCP(x)
3311     if (CLHS->isExactlyValue(1.0)) {
3312       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3313         .addUse(RHS)
3314         .setMIFlags(Flags);
3315 
3316       MI.eraseFromParent();
3317       return true;
3318     }
3319 
3320     // -1 / x -> RCP( FNEG(x) )
3321     if (CLHS->isExactlyValue(-1.0)) {
3322       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3323       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3324         .addUse(FNeg.getReg(0))
3325         .setMIFlags(Flags);
3326 
3327       MI.eraseFromParent();
3328       return true;
3329     }
3330   }
3331 
3332   // x / y -> x * (1.0 / y)
3333   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3334     .addUse(RHS)
3335     .setMIFlags(Flags);
3336   B.buildFMul(Res, LHS, RCP, Flags);
3337 
3338   MI.eraseFromParent();
3339   return true;
3340 }
3341 
3342 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3343                                                    MachineRegisterInfo &MRI,
3344                                                    MachineIRBuilder &B) const {
3345   Register Res = MI.getOperand(0).getReg();
3346   Register X = MI.getOperand(1).getReg();
3347   Register Y = MI.getOperand(2).getReg();
3348   uint16_t Flags = MI.getFlags();
3349   LLT ResTy = MRI.getType(Res);
3350 
3351   const MachineFunction &MF = B.getMF();
3352   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3353                             MI.getFlag(MachineInstr::FmAfn);
3354 
3355   if (!AllowInaccurateRcp)
3356     return false;
3357 
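  // Refine rcp(y) with two Newton-Raphson steps, then correct the quotient.
  // Scalar sketch (illustrative only):
  //   r = rcp(y);
  //   r = r + r * fma(-y, r, 1.0);       // r += r * (1 - y*r), step 1
  //   r = r + r * fma(-y, r, 1.0);       // step 2
  //   q = x * r;
  //   return fma(fma(-y, q, x), r, q);   // q + r * (x - y*q)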
3358   auto NegY = B.buildFNeg(ResTy, Y);
3359   auto One = B.buildFConstant(ResTy, 1.0);
3360 
3361   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3362     .addUse(Y)
3363     .setMIFlags(Flags);
3364 
3365   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3366   R = B.buildFMA(ResTy, Tmp0, R, R);
3367 
3368   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3369   R = B.buildFMA(ResTy, Tmp1, R, R);
3370 
3371   auto Ret = B.buildFMul(ResTy, X, R);
3372   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3373 
3374   B.buildFMA(Res, Tmp2, R, Ret);
3375   MI.eraseFromParent();
3376   return true;
3377 }
3378 
3379 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3380                                          MachineRegisterInfo &MRI,
3381                                          MachineIRBuilder &B) const {
3382   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3383     return true;
3384 
3385   Register Res = MI.getOperand(0).getReg();
3386   Register LHS = MI.getOperand(1).getReg();
3387   Register RHS = MI.getOperand(2).getReg();
3388 
3389   uint16_t Flags = MI.getFlags();
3390 
3391   LLT S16 = LLT::scalar(16);
3392   LLT S32 = LLT::scalar(32);
3393 
3394   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3395   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3396 
3397   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3398     .addUse(RHSExt.getReg(0))
3399     .setMIFlags(Flags);
3400 
3401   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3402   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3403 
3404   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3405     .addUse(RDst.getReg(0))
3406     .addUse(RHS)
3407     .addUse(LHS)
3408     .setMIFlags(Flags);
3409 
3410   MI.eraseFromParent();
3411   return true;
3412 }
3413 
// Enable or disable FP32 denormal mode: when 'Enable' is true, emit the
// instructions that enable denormals; when it is false, restore the
// function's default FP32 denormal mode.
3416 static void toggleSPDenormMode(bool Enable,
3417                                MachineIRBuilder &B,
3418                                const GCNSubtarget &ST,
3419                                AMDGPU::SIModeRegisterDefaults Mode) {
3420   // Set SP denorm mode to this value.
3421   unsigned SPDenormMode =
3422     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3423 
3424   if (ST.hasDenormModeInst()) {
3425     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3426     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3427 
3428     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3429     B.buildInstr(AMDGPU::S_DENORM_MODE)
3430       .addImm(NewDenormModeValue);
3431 
3432   } else {
3433     // Select FP32 bit field in mode register.
3434     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3435                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3436                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3437 
3438     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3439       .addImm(SPDenormMode)
3440       .addImm(SPDenormModeBitField);
3441   }
3442 }
3443 
3444 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3445                                          MachineRegisterInfo &MRI,
3446                                          MachineIRBuilder &B) const {
3447   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3448     return true;
3449 
3450   Register Res = MI.getOperand(0).getReg();
3451   Register LHS = MI.getOperand(1).getReg();
3452   Register RHS = MI.getOperand(2).getReg();
3453   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3454   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3455 
3456   uint16_t Flags = MI.getFlags();
3457 
3458   LLT S32 = LLT::scalar(32);
3459   LLT S1 = LLT::scalar(1);
3460 
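  // This follows the usual AMDGPU f32 division expansion: div_scale pre-scales
  // the numerator and denominator into a safe exponent range, a chain of FMAs
  // performs Newton-Raphson refinement of rcp(denominator) and of the scaled
  // quotient, div_fmas folds the scale correction back in, and div_fixup
  // handles infinities, NaNs and signed zeros. (High-level summary only, not
  // a precise numerical analysis.)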
3461   auto One = B.buildFConstant(S32, 1.0f);
3462 
3463   auto DenominatorScaled =
3464     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3465       .addUse(LHS)
3466       .addUse(RHS)
3467       .addImm(0)
3468       .setMIFlags(Flags);
3469   auto NumeratorScaled =
3470     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3471       .addUse(LHS)
3472       .addUse(RHS)
3473       .addImm(1)
3474       .setMIFlags(Flags);
3475 
3476   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3477     .addUse(DenominatorScaled.getReg(0))
3478     .setMIFlags(Flags);
3479   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3480 
3481   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3482   // aren't modeled as reading it.
3483   if (!Mode.allFP32Denormals())
3484     toggleSPDenormMode(true, B, ST, Mode);
3485 
3486   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3487   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3488   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3489   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3490   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3491   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3492 
3493   if (!Mode.allFP32Denormals())
3494     toggleSPDenormMode(false, B, ST, Mode);
3495 
3496   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3497     .addUse(Fma4.getReg(0))
3498     .addUse(Fma1.getReg(0))
3499     .addUse(Fma3.getReg(0))
3500     .addUse(NumeratorScaled.getReg(1))
3501     .setMIFlags(Flags);
3502 
3503   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3504     .addUse(Fmas.getReg(0))
3505     .addUse(RHS)
3506     .addUse(LHS)
3507     .setMIFlags(Flags);
3508 
3509   MI.eraseFromParent();
3510   return true;
3511 }
3512 
3513 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3514                                          MachineRegisterInfo &MRI,
3515                                          MachineIRBuilder &B) const {
3516   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3517     return true;
3518 
3519   Register Res = MI.getOperand(0).getReg();
3520   Register LHS = MI.getOperand(1).getReg();
3521   Register RHS = MI.getOperand(2).getReg();
3522 
3523   uint16_t Flags = MI.getFlags();
3524 
3525   LLT S64 = LLT::scalar(64);
3526   LLT S1 = LLT::scalar(1);
3527 
3528   auto One = B.buildFConstant(S64, 1.0);
3529 
3530   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3531     .addUse(LHS)
3532     .addUse(RHS)
3533     .addImm(0)
3534     .setMIFlags(Flags);
3535 
3536   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3537 
3538   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3539     .addUse(DivScale0.getReg(0))
3540     .setMIFlags(Flags);
3541 
3542   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3543   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3544   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3545 
3546   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3547     .addUse(LHS)
3548     .addUse(RHS)
3549     .addImm(1)
3550     .setMIFlags(Flags);
3551 
3552   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3553   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3554   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3555 
3556   Register Scale;
3557   if (!ST.hasUsableDivScaleConditionOutput()) {
3558     // Workaround a hardware bug on SI where the condition output from div_scale
3559     // is not usable.
3560 
3561     LLT S32 = LLT::scalar(32);
3562 
3563     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3564     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3565     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3566     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3567 
3568     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3569                               Scale1Unmerge.getReg(1));
3570     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3571                               Scale0Unmerge.getReg(1));
3572     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3573   } else {
3574     Scale = DivScale1.getReg(1);
3575   }
3576 
3577   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3578     .addUse(Fma4.getReg(0))
3579     .addUse(Fma3.getReg(0))
3580     .addUse(Mul.getReg(0))
3581     .addUse(Scale)
3582     .setMIFlags(Flags);
3583 
3584   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3585     .addUse(Fmas.getReg(0))
3586     .addUse(RHS)
3587     .addUse(LHS)
3588     .setMIFlags(Flags);
3589 
3590   MI.eraseFromParent();
3591   return true;
3592 }
3593 
3594 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3595                                                  MachineRegisterInfo &MRI,
3596                                                  MachineIRBuilder &B) const {
3597   Register Res = MI.getOperand(0).getReg();
3598   Register LHS = MI.getOperand(2).getReg();
3599   Register RHS = MI.getOperand(3).getReg();
3600   uint16_t Flags = MI.getFlags();
3601 
3602   LLT S32 = LLT::scalar(32);
3603   LLT S1 = LLT::scalar(1);
3604 
3605   auto Abs = B.buildFAbs(S32, RHS, Flags);
3606   const APFloat C0Val(1.0f);
3607 
3608   auto C0 = B.buildConstant(S32, 0x6f800000);
3609   auto C1 = B.buildConstant(S32, 0x2f800000);
3610   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3611 
3612   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3613   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3614 
3615   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3616 
3617   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3618     .addUse(Mul0.getReg(0))
3619     .setMIFlags(Flags);
3620 
3621   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3622 
3623   B.buildFMul(Res, Sel, Mul1, Flags);
3624 
3625   MI.eraseFromParent();
3626   return true;
3627 }
3628 
3629 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3630 // FIXME: Why do we handle this one but not other removed instructions?
3631 //
3632 // Reciprocal square root.  The clamp prevents infinite results, clamping
3633 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3634 // +-max_float.
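//
// Expanded below as the equivalent of
//   clamp(rsq(x), -max_float, +max_float)
// i.e. fmax(fmin(rsq(x), +max_float), -max_float).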
3635 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3636                                                     MachineRegisterInfo &MRI,
3637                                                     MachineIRBuilder &B) const {
3638   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3639     return true;
3640 
3641   Register Dst = MI.getOperand(0).getReg();
3642   Register Src = MI.getOperand(2).getReg();
3643   auto Flags = MI.getFlags();
3644 
3645   LLT Ty = MRI.getType(Dst);
3646 
3647   const fltSemantics *FltSemantics;
3648   if (Ty == LLT::scalar(32))
3649     FltSemantics = &APFloat::IEEEsingle();
3650   else if (Ty == LLT::scalar(64))
3651     FltSemantics = &APFloat::IEEEdouble();
3652   else
3653     return false;
3654 
3655   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3656     .addUse(Src)
3657     .setMIFlags(Flags);
3658 
  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq already quieted (or didn't quiet) any snan, so use the
  // variant that will directly select.
3661   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3662   const bool UseIEEE = MFI->getMode().IEEE;
3663 
3664   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3665   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3666                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3667 
3668   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3669 
3670   if (UseIEEE)
3671     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3672   else
3673     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3674   MI.eraseFromParent();
3675   return true;
3676 }
3677 
3678 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3679   switch (IID) {
3680   case Intrinsic::amdgcn_ds_fadd:
3681     return AMDGPU::G_ATOMICRMW_FADD;
3682   case Intrinsic::amdgcn_ds_fmin:
3683     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3684   case Intrinsic::amdgcn_ds_fmax:
3685     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3686   default:
3687     llvm_unreachable("not a DS FP intrinsic");
3688   }
3689 }
3690 
3691 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3692                                                       MachineInstr &MI,
3693                                                       Intrinsic::ID IID) const {
3694   GISelChangeObserver &Observer = Helper.Observer;
3695   Observer.changingInstr(MI);
3696 
3697   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3698 
3699   // The remaining operands were used to set fields in the MemOperand on
3700   // construction.
3701   for (int I = 6; I > 3; --I)
3702     MI.RemoveOperand(I);
3703 
3704   MI.RemoveOperand(1); // Remove the intrinsic ID.
3705   Observer.changedInstr(MI);
3706   return true;
3707 }
3708 
3709 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3710                                             MachineRegisterInfo &MRI,
3711                                             MachineIRBuilder &B) const {
3712   uint64_t Offset =
3713     ST.getTargetLowering()->getImplicitParameterOffset(
3714       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3715   LLT DstTy = MRI.getType(DstReg);
3716   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3717 
3718   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3719   if (!loadInputValue(KernargPtrReg, B,
3720                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3721     return false;
3722 
3723   // FIXME: This should be nuw
3724   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3725   return true;
3726 }
3727 
3728 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3729                                                  MachineRegisterInfo &MRI,
3730                                                  MachineIRBuilder &B) const {
3731   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3732   if (!MFI->isEntryFunction()) {
3733     return legalizePreloadedArgIntrin(MI, MRI, B,
3734                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3735   }
3736 
3737   Register DstReg = MI.getOperand(0).getReg();
3738   if (!getImplicitArgPtr(DstReg, MRI, B))
3739     return false;
3740 
3741   MI.eraseFromParent();
3742   return true;
3743 }
3744 
3745 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3746                                               MachineRegisterInfo &MRI,
3747                                               MachineIRBuilder &B,
3748                                               unsigned AddrSpace) const {
3749   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3750   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3751   Register Hi32 = Unmerge.getReg(1);
3752 
3753   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3754   MI.eraseFromParent();
3755   return true;
3756 }
3757 
3758 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3759 // offset (the offset that is included in bounds checking and swizzling, to be
3760 // split between the instruction's voffset and immoffset fields) and soffset
3761 // (the offset that is excluded from bounds checking and swizzling, to go in
3762 // the instruction's soffset field).  This function takes the first kind of
3763 // offset and figures out how to split it between voffset and immoffset.
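//
// For example, a combined offset of 0x11234 is returned as ImmOffset = 0x234
// (which fits the 12-bit immoffset field) plus a base-register contribution of
// 0x11000, a multiple of 4096 that CSEs well across neighboring accesses.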
3764 std::pair<Register, unsigned>
3765 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3766                                         Register OrigOffset) const {
3767   const unsigned MaxImm = 4095;
3768   Register BaseReg;
3769   unsigned ImmOffset;
3770   const LLT S32 = LLT::scalar(32);
3771   MachineRegisterInfo &MRI = *B.getMRI();
3772 
3773   std::tie(BaseReg, ImmOffset) =
3774       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3775 
3776   // If BaseReg is a pointer, convert it to int.
3777   if (MRI.getType(BaseReg).isPointer())
3778     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3779 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in the immoffset field and move the value & -4096 part into
  // the voffset, so that the value that is copied/added for the voffset field
  // is a multiple of 4096 and stands a better chance of being CSEd with the
  // copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if the
  // multiple is a negative number, as it appears to be illegal to have a
  // negative offset in the vgpr, even if adding the immediate offset makes it
  // positive.
3787   unsigned Overflow = ImmOffset & ~MaxImm;
3788   ImmOffset -= Overflow;
3789   if ((int32_t)Overflow < 0) {
3790     Overflow += ImmOffset;
3791     ImmOffset = 0;
3792   }
3793 
3794   if (Overflow != 0) {
3795     if (!BaseReg) {
3796       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3797     } else {
3798       auto OverflowVal = B.buildConstant(S32, Overflow);
3799       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3800     }
3801   }
3802 
3803   if (!BaseReg)
3804     BaseReg = B.buildConstant(S32, 0).getReg(0);
3805 
3806   return std::make_pair(BaseReg, ImmOffset);
3807 }
3808 
3809 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3810 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3811                                           Register VOffset, Register SOffset,
3812                                           unsigned ImmOffset, Register VIndex,
3813                                           MachineRegisterInfo &MRI) const {
3814   Optional<ValueAndVReg> MaybeVOffsetVal =
3815       getIConstantVRegValWithLookThrough(VOffset, MRI);
3816   Optional<ValueAndVReg> MaybeSOffsetVal =
3817       getIConstantVRegValWithLookThrough(SOffset, MRI);
3818   Optional<ValueAndVReg> MaybeVIndexVal =
3819       getIConstantVRegValWithLookThrough(VIndex, MRI);
3820   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3821   // update the MMO with that offset. The stride is unknown so we can only do
3822   // this if VIndex is constant 0.
3823   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3824       MaybeVIndexVal->Value == 0) {
3825     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3826                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3827     MMO->setOffset(TotalOffset);
3828   } else {
3829     // We don't have a constant combined offset to use in the MMO. Give up.
3830     MMO->setValue((Value *)nullptr);
3831   }
3832 }
3833 
3834 /// Handle register layout difference for f16 images for some subtargets.
3835 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3836                                              MachineRegisterInfo &MRI,
3837                                              Register Reg,
3838                                              bool ImageStore) const {
3839   const LLT S16 = LLT::scalar(16);
3840   const LLT S32 = LLT::scalar(32);
3841   LLT StoreVT = MRI.getType(Reg);
3842   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3843 
3844   if (ST.hasUnpackedD16VMem()) {
3845     auto Unmerge = B.buildUnmerge(S16, Reg);
3846 
3847     SmallVector<Register, 4> WideRegs;
3848     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3849       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3850 
3851     int NumElts = StoreVT.getNumElements();
3852 
3853     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3854         .getReg(0);
3855   }
3856 
3857   if (ImageStore && ST.hasImageStoreD16Bug()) {
3858     if (StoreVT.getNumElements() == 2) {
3859       SmallVector<Register, 4> PackedRegs;
3860       Reg = B.buildBitcast(S32, Reg).getReg(0);
3861       PackedRegs.push_back(Reg);
3862       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3863       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3864           .getReg(0);
3865     }
3866 
3867     if (StoreVT.getNumElements() == 3) {
3868       SmallVector<Register, 4> PackedRegs;
3869       auto Unmerge = B.buildUnmerge(S16, Reg);
3870       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3871         PackedRegs.push_back(Unmerge.getReg(I));
3872       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3873       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3874       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3875     }
3876 
3877     if (StoreVT.getNumElements() == 4) {
3878       SmallVector<Register, 4> PackedRegs;
3879       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3880       auto Unmerge = B.buildUnmerge(S32, Reg);
3881       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3882         PackedRegs.push_back(Unmerge.getReg(I));
3883       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3884       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3885           .getReg(0);
3886     }
3887 
3888     llvm_unreachable("invalid data type");
3889   }
3890 
3891   if (StoreVT == LLT::fixed_vector(3, S16)) {
3892     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3893               .getReg(0);
3894   }
3895   return Reg;
3896 }
3897 
3898 Register AMDGPULegalizerInfo::fixStoreSourceType(
3899   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3900   MachineRegisterInfo *MRI = B.getMRI();
3901   LLT Ty = MRI->getType(VData);
3902 
3903   const LLT S16 = LLT::scalar(16);
3904 
  // Fixup illegal register types for i8 and i16 stores.
3906   if (Ty == LLT::scalar(8) || Ty == S16) {
3907     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3908     return AnyExt;
3909   }
3910 
3911   if (Ty.isVector()) {
3912     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3913       if (IsFormat)
3914         return handleD16VData(B, *MRI, VData);
3915     }
3916   }
3917 
3918   return VData;
3919 }
3920 
3921 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3922                                               MachineRegisterInfo &MRI,
3923                                               MachineIRBuilder &B,
3924                                               bool IsTyped,
3925                                               bool IsFormat) const {
3926   Register VData = MI.getOperand(1).getReg();
3927   LLT Ty = MRI.getType(VData);
3928   LLT EltTy = Ty.getScalarType();
3929   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3930   const LLT S32 = LLT::scalar(32);
3931 
3932   VData = fixStoreSourceType(B, VData, IsFormat);
3933   Register RSrc = MI.getOperand(2).getReg();
3934 
3935   MachineMemOperand *MMO = *MI.memoperands_begin();
3936   const int MemSize = MMO->getSize();
3937 
3938   unsigned ImmOffset;
3939 
3940   // The typed intrinsics add an immediate after the registers.
3941   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3942 
3943   // The struct intrinsic variants add one additional operand over raw.
3944   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3945   Register VIndex;
3946   int OpOffset = 0;
3947   if (HasVIndex) {
3948     VIndex = MI.getOperand(3).getReg();
3949     OpOffset = 1;
3950   } else {
3951     VIndex = B.buildConstant(S32, 0).getReg(0);
3952   }
3953 
3954   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3955   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3956 
3957   unsigned Format = 0;
3958   if (IsTyped) {
3959     Format = MI.getOperand(5 + OpOffset).getImm();
3960     ++OpOffset;
3961   }
3962 
3963   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3964 
3965   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3966   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3967 
3968   unsigned Opc;
3969   if (IsTyped) {
3970     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3971                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3972   } else if (IsFormat) {
3973     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3974                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3975   } else {
3976     switch (MemSize) {
3977     case 1:
3978       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3979       break;
3980     case 2:
3981       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3982       break;
3983     default:
3984       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3985       break;
3986     }
3987   }
3988 
3989   auto MIB = B.buildInstr(Opc)
3990     .addUse(VData)              // vdata
3991     .addUse(RSrc)               // rsrc
3992     .addUse(VIndex)             // vindex
3993     .addUse(VOffset)            // voffset
3994     .addUse(SOffset)            // soffset
3995     .addImm(ImmOffset);         // offset(imm)
3996 
3997   if (IsTyped)
3998     MIB.addImm(Format);
3999 
4000   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4001      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4002      .addMemOperand(MMO);
4003 
4004   MI.eraseFromParent();
4005   return true;
4006 }
4007 
4008 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4009                                              MachineRegisterInfo &MRI,
4010                                              MachineIRBuilder &B,
4011                                              bool IsFormat,
4012                                              bool IsTyped) const {
4013   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4014   MachineMemOperand *MMO = *MI.memoperands_begin();
4015   const LLT MemTy = MMO->getMemoryType();
4016   const LLT S32 = LLT::scalar(32);
4017 
4018   Register Dst = MI.getOperand(0).getReg();
4019   Register RSrc = MI.getOperand(2).getReg();
4020 
4021   // The typed intrinsics add an immediate after the registers.
4022   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4023 
4024   // The struct intrinsic variants add one additional operand over raw.
4025   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4026   Register VIndex;
4027   int OpOffset = 0;
4028   if (HasVIndex) {
4029     VIndex = MI.getOperand(3).getReg();
4030     OpOffset = 1;
4031   } else {
4032     VIndex = B.buildConstant(S32, 0).getReg(0);
4033   }
4034 
4035   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4036   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4037 
4038   unsigned Format = 0;
4039   if (IsTyped) {
4040     Format = MI.getOperand(5 + OpOffset).getImm();
4041     ++OpOffset;
4042   }
4043 
4044   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4045   unsigned ImmOffset;
4046 
4047   LLT Ty = MRI.getType(Dst);
4048   LLT EltTy = Ty.getScalarType();
4049   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4050   const bool Unpacked = ST.hasUnpackedD16VMem();
4051 
4052   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4053   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4054 
4055   unsigned Opc;
4056 
4057   if (IsTyped) {
4058     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4059                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4060   } else if (IsFormat) {
4061     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4062                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4063   } else {
4064     switch (MemTy.getSizeInBits()) {
4065     case 8:
4066       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4067       break;
4068     case 16:
4069       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4070       break;
4071     default:
4072       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4073       break;
4074     }
4075   }
4076 
4077   Register LoadDstReg;
4078 
4079   bool IsExtLoad =
4080       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4081   LLT UnpackedTy = Ty.changeElementSize(32);
4082 
4083   if (IsExtLoad)
4084     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4085   else if (Unpacked && IsD16 && Ty.isVector())
4086     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4087   else
4088     LoadDstReg = Dst;
4089 
4090   auto MIB = B.buildInstr(Opc)
4091     .addDef(LoadDstReg)         // vdata
4092     .addUse(RSrc)               // rsrc
4093     .addUse(VIndex)             // vindex
4094     .addUse(VOffset)            // voffset
4095     .addUse(SOffset)            // soffset
4096     .addImm(ImmOffset);         // offset(imm)
4097 
4098   if (IsTyped)
4099     MIB.addImm(Format);
4100 
4101   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4102      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4103      .addMemOperand(MMO);
4104 
4105   if (LoadDstReg != Dst) {
4106     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4107 
    // The result register was widened for the extending load; truncate it
    // back to the expected type.
4109     if (IsExtLoad)
4110       B.buildTrunc(Dst, LoadDstReg);
4111     else {
4112       // Repack to original 16-bit vector result
4113       // FIXME: G_TRUNC should work, but legalization currently fails
4114       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4115       SmallVector<Register, 4> Repack;
4116       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4117         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4118       B.buildMerge(Dst, Repack);
4119     }
4120   }
4121 
4122   MI.eraseFromParent();
4123   return true;
4124 }
4125 
4126 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4127                                                MachineIRBuilder &B,
4128                                                bool IsInc) const {
4129   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4130                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4131   B.buildInstr(Opc)
4132     .addDef(MI.getOperand(0).getReg())
4133     .addUse(MI.getOperand(2).getReg())
4134     .addUse(MI.getOperand(3).getReg())
4135     .cloneMemRefs(MI);
4136   MI.eraseFromParent();
4137   return true;
4138 }
4139 
4140 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4141   switch (IntrID) {
4142   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4143   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4144     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4145   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4146   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4147     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4148   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4149   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4150     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4151   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4152   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4153     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4154   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4155   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4156     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4157   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4158   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4159     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4160   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4161   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4162     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4163   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4164   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4165     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4166   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4167   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4168     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4169   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4170   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4171     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4172   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4173   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4174     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4175   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4176   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4177     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4178   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4179   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4180     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4181   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4182   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4183     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4184   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4185   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4186     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4187   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4188   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4189     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4190   default:
4191     llvm_unreachable("unhandled atomic opcode");
4192   }
4193 }
4194 
4195 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4196                                                MachineIRBuilder &B,
4197                                                Intrinsic::ID IID) const {
4198   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4199                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4200   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4201 
4202   Register Dst;
4203 
4204   int OpOffset = 0;
4205   if (HasReturn) {
4206     // A few FP atomics do not support return values.
4207     Dst = MI.getOperand(0).getReg();
4208   } else {
4209     OpOffset = -1;
4210   }
4211 
4212   Register VData = MI.getOperand(2 + OpOffset).getReg();
4213   Register CmpVal;
4214 
4215   if (IsCmpSwap) {
4216     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4217     ++OpOffset;
4218   }
4219 
4220   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4221   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4222 
4223   // The struct intrinsic variants add one additional operand over raw.
4224   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4225   Register VIndex;
4226   if (HasVIndex) {
4227     VIndex = MI.getOperand(4 + OpOffset).getReg();
4228     ++OpOffset;
4229   } else {
4230     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4231   }
4232 
4233   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4234   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4235   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4236 
4237   MachineMemOperand *MMO = *MI.memoperands_begin();
4238 
4239   unsigned ImmOffset;
4240   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4241   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4242 
4243   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4244 
4245   if (HasReturn)
4246     MIB.addDef(Dst);
4247 
4248   MIB.addUse(VData); // vdata
4249 
4250   if (IsCmpSwap)
4251     MIB.addReg(CmpVal);
4252 
4253   MIB.addUse(RSrc)               // rsrc
4254      .addUse(VIndex)             // vindex
4255      .addUse(VOffset)            // voffset
4256      .addUse(SOffset)            // soffset
4257      .addImm(ImmOffset)          // offset(imm)
4258      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4259      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4260      .addMemOperand(MMO);
4261 
4262   MI.eraseFromParent();
4263   return true;
4264 }
4265 
4266 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4267 /// vector with s16 typed elements.
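///
/// For example, with A16 enabled, three s16 coordinates (x, y, z) are packed
/// into two dwords: <2 x s16>(x, y) and <2 x s16>(z, undef).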
4268 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4269                                       SmallVectorImpl<Register> &PackedAddrs,
4270                                       unsigned ArgOffset,
4271                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4272                                       bool IsA16, bool IsG16) {
4273   const LLT S16 = LLT::scalar(16);
4274   const LLT V2S16 = LLT::fixed_vector(2, 16);
4275   auto EndIdx = Intr->VAddrEnd;
4276 
4277   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4278     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4279     if (!SrcOp.isReg())
4280       continue; // _L to _LZ may have eliminated this.
4281 
4282     Register AddrReg = SrcOp.getReg();
4283 
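    // Operands that remain dword-sized: anything before the gradients, the
    // gradients when G16 is disabled, and the coordinates when A16 is
    // disabled.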
4284     if ((I < Intr->GradientStart) ||
4285         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4286         (I >= Intr->CoordStart && !IsA16)) {
4287       if ((I < Intr->GradientStart) && IsA16 &&
4288           (B.getMRI()->getType(AddrReg) == S16)) {
4289         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32 bits.
4292         PackedAddrs.push_back(
4293             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4294                 .getReg(0));
4295       } else {
4296         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4297                "Bias needs to be converted to 16 bit in A16 mode");
4298         // Handle any gradient or coordinate operands that should not be packed
4299         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4300         PackedAddrs.push_back(AddrReg);
4301       }
4302     } else {
4303       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4304       // derivatives dx/dh and dx/dv are packed with undef.
4305       if (((I + 1) >= EndIdx) ||
4306           ((Intr->NumGradients / 2) % 2 == 1 &&
4307            (I == static_cast<unsigned>(Intr->GradientStart +
4308                                        (Intr->NumGradients / 2) - 1) ||
4309             I == static_cast<unsigned>(Intr->GradientStart +
4310                                        Intr->NumGradients - 1))) ||
4311           // Check for _L to _LZ optimization
4312           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4313         PackedAddrs.push_back(
4314             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4315                 .getReg(0));
4316       } else {
4317         PackedAddrs.push_back(
4318             B.buildBuildVector(
4319                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4320                 .getReg(0));
4321         ++I;
4322       }
4323     }
4324   }
4325 }
4326 
4327 /// Convert from separate vaddr components to a single vector address register,
4328 /// and replace the remaining operands with $noreg.
4329 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4330                                      int DimIdx, int NumVAddrs) {
4331   const LLT S32 = LLT::scalar(32);
4332 
4333   SmallVector<Register, 8> AddrRegs;
4334   for (int I = 0; I != NumVAddrs; ++I) {
4335     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4336     if (SrcOp.isReg()) {
4337       AddrRegs.push_back(SrcOp.getReg());
4338       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4339     }
4340   }
4341 
4342   int NumAddrRegs = AddrRegs.size();
4343   if (NumAddrRegs != 1) {
    // Above 8 elements, round up to the next power of 2 (i.e. 16).
4345     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4346       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4347       auto Undef = B.buildUndef(S32);
4348       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4349       NumAddrRegs = RoundedNumRegs;
4350     }
4351 
4352     auto VAddr =
4353         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4354     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4355   }
4356 
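  // Replace the now-redundant trailing vaddr operands with $noreg.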
4357   for (int I = 1; I != NumVAddrs; ++I) {
4358     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4359     if (SrcOp.isReg())
4360       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4361   }
4362 }
4363 
4364 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4365 ///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
4377 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4378     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4379     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4380 
4381   const unsigned NumDefs = MI.getNumExplicitDefs();
4382   const unsigned ArgOffset = NumDefs + 1;
4383   bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or that need to repack the TFE
  // result.
4386 
4387   // TODO: Do we need to guard against already legalized intrinsics?
4388   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4389       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4390 
4391   MachineRegisterInfo *MRI = B.getMRI();
4392   const LLT S32 = LLT::scalar(32);
4393   const LLT S16 = LLT::scalar(16);
4394   const LLT V2S16 = LLT::fixed_vector(2, 16);
4395 
4396   unsigned DMask = 0;
4397   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4398   LLT Ty = MRI->getType(VData);
4399 
  // Check for 16-bit addresses and pack them if so.
4401   LLT GradTy =
4402       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4403   LLT AddrTy =
4404       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4405   const bool IsG16 = GradTy == S16;
4406   const bool IsA16 = AddrTy == S16;
4407   const bool IsD16 = Ty.getScalarType() == S16;
4408 
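  // Determine how many result components the dmask requests; gather4 always
  // returns four.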
4409   int DMaskLanes = 0;
4410   if (!BaseOpcode->Atomic) {
4411     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4412     if (BaseOpcode->Gather4) {
4413       DMaskLanes = 4;
4414     } else if (DMask != 0) {
4415       DMaskLanes = countPopulation(DMask);
4416     } else if (!IsTFE && !BaseOpcode->Store) {
4417       // If dmask is 0, this is a no-op load. This can be eliminated.
4418       B.buildUndef(MI.getOperand(0));
4419       MI.eraseFromParent();
4420       return true;
4421     }
4422   }
4423 
4424   Observer.changingInstr(MI);
4425   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4426 
4427   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4428                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4429   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4430                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4431   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4432 
4433   // Track that we legalized this
4434   MI.setDesc(B.getTII().get(NewOpcode));
4435 
  // We expect to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4438   if (IsTFE && DMask == 0) {
4439     DMask = 0x1;
4440     DMaskLanes = 1;
4441     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4442   }
4443 
4444   if (BaseOpcode->Atomic) {
4445     Register VData0 = MI.getOperand(2).getReg();
4446     LLT Ty = MRI->getType(VData0);
4447 
4448     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4449     if (Ty.isVector())
4450       return false;
4451 
4452     if (BaseOpcode->AtomicX2) {
4453       Register VData1 = MI.getOperand(3).getReg();
4454       // The two values are packed in one register.
4455       LLT PackedTy = LLT::fixed_vector(2, Ty);
4456       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4457       MI.getOperand(2).setReg(Concat.getReg(0));
4458       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4459     }
4460   }
4461 
4462   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4463 
4464   // Rewrite the addressing register layout before doing anything else.
4465   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but they are tied to the A16 control,
    // so both gradients and addresses must be 16 bit.
4468     return false;
4469   }
4470 
4471   if (IsA16 && !ST.hasA16()) {
4472     // A16 not supported
4473     return false;
4474   }
4475 
4476   if (IsA16 || IsG16) {
4477     if (Intr->NumVAddrs > 1) {
4478       SmallVector<Register, 4> PackedRegs;
4479 
4480       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4481                                 IsG16);
4482 
4483       // See also below in the non-a16 branch
4484       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4485                           PackedRegs.size() <= ST.getNSAMaxSize();
4486 
4487       if (!UseNSA && PackedRegs.size() > 1) {
4488         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4489         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4490         PackedRegs[0] = Concat.getReg(0);
4491         PackedRegs.resize(1);
4492       }
4493 
4494       const unsigned NumPacked = PackedRegs.size();
4495       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4496         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4497         if (!SrcOp.isReg()) {
4498           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4499           continue;
4500         }
4501 
4502         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4503 
4504         if (I - Intr->VAddrStart < NumPacked)
4505           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4506         else
4507           SrcOp.setReg(AMDGPU::NoRegister);
4508       }
4509     }
4510   } else {
4511     // If the register allocator cannot place the address registers contiguously
4512     // without introducing moves, then using the non-sequential address encoding
4513     // is always preferable, since it saves VALU instructions and is usually a
4514     // wash in terms of code size or even better.
4515     //
4516     // However, we currently have no way of hinting to the register allocator
4517     // that MIMG addresses should be placed contiguously when it is possible to
4518     // do so, so force non-NSA for the common 2-address case as a heuristic.
4519     //
4520     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4521     // allocation when possible.
4522     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4523                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4524 
4525     if (!UseNSA && Intr->NumVAddrs > 1)
4526       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4527                                Intr->NumVAddrs);
4528   }
4529 
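  // Record the a16 (bit 0) and g16 (bit 1) state in an extra immediate
  // operand appended to the instruction.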
4530   int Flags = 0;
4531   if (IsA16)
4532     Flags |= 1;
4533   if (IsG16)
4534     Flags |= 2;
4535   MI.addOperand(MachineOperand::CreateImm(Flags));
4536 
4537   if (BaseOpcode->Store) { // No TFE for stores?
4538     // TODO: Handle dmask trim
4539     if (!Ty.isVector() || !IsD16)
4540       return true;
4541 
4542     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4543     if (RepackedReg != VData) {
4544       MI.getOperand(1).setReg(RepackedReg);
4545     }
4546 
4547     return true;
4548   }
4549 
4550   Register DstReg = MI.getOperand(0).getReg();
4551   const LLT EltTy = Ty.getScalarType();
4552   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4553 
4554   // Confirm that the return type is large enough for the dmask specified
4555   if (NumElts < DMaskLanes)
4556     return false;
4557 
4558   if (NumElts > 4 || DMaskLanes > 4)
4559     return false;
4560 
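  // The dmask determines how many components are actually loaded; a dmask of
  // 0 still produces a single component.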
4561   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4562   const LLT AdjustedTy =
4563       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4564 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4568   LLT RoundedTy;
4569 
  // S32 vector to cover all data, plus the TFE result element.
4571   LLT TFETy;
4572 
4573   // Register type to use for each loaded component. Will be S32 or V2S16.
4574   LLT RegTy;
4575 
4576   if (IsD16 && ST.hasUnpackedD16VMem()) {
4577     RoundedTy =
4578         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4579     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4580     RegTy = S32;
4581   } else {
4582     unsigned EltSize = EltTy.getSizeInBits();
4583     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4584     unsigned RoundedSize = 32 * RoundedElts;
4585     RoundedTy = LLT::scalarOrVector(
4586         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4587     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4588     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4589   }
4590 
4591   // The return type does not need adjustment.
4592   // TODO: Should we change s16 case to s32 or <2 x s16>?
4593   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4594     return true;
4595 
4596   Register Dst1Reg;
4597 
4598   // Insert after the instruction.
4599   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4600 
4601   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4602   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4603   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4604   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4605 
4606   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4607 
4608   MI.getOperand(0).setReg(NewResultReg);
4609 
4610   // In the IR, TFE is supposed to be used with a 2 element struct return
4611   // type. The instruction really returns these two values in one contiguous
4612   // register, with one additional dword beyond the loaded data. Rewrite the
4613   // return type to use a single register result.
4614 
4615   if (IsTFE) {
4616     Dst1Reg = MI.getOperand(1).getReg();
4617     if (MRI->getType(Dst1Reg) != S32)
4618       return false;
4619 
4620     // TODO: Make sure the TFE operand bit is set.
4621     MI.RemoveOperand(1);
4622 
4623     // Handle the easy case that requires no repack instructions.
4624     if (Ty == S32) {
4625       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4626       return true;
4627     }
4628   }
4629 
4630   // Now figure out how to copy the new result register back into the old
4631   // result.
4632   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4633 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4635 
4636   if (ResultNumRegs == 1) {
4637     assert(!IsTFE);
4638     ResultRegs[0] = NewResultReg;
4639   } else {
4640     // We have to repack into a new vector of some kind.
4641     for (int I = 0; I != NumDataRegs; ++I)
4642       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4643     B.buildUnmerge(ResultRegs, NewResultReg);
4644 
4645     // Drop the final TFE element to get the data part. The TFE result is
4646     // directly written to the right place already.
4647     if (IsTFE)
4648       ResultRegs.resize(NumDataRegs);
4649   }
4650 
4651   // For an s16 scalar result, we form an s32 result with a truncate regardless
4652   // of packed vs. unpacked.
4653   if (IsD16 && !Ty.isVector()) {
4654     B.buildTrunc(DstReg, ResultRegs[0]);
4655     return true;
4656   }
4657 
4658   // Avoid a build/concat_vector of 1 entry.
4659   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4660     B.buildBitcast(DstReg, ResultRegs[0]);
4661     return true;
4662   }
4663 
4664   assert(Ty.isVector());
4665 
4666   if (IsD16) {
4667     // For packed D16 results with TFE enabled, all the data components are
4668     // S32. Cast back to the expected type.
4669     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4672     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4673       for (Register &Reg : ResultRegs)
4674         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4675     } else if (ST.hasUnpackedD16VMem()) {
4676       for (Register &Reg : ResultRegs)
4677         Reg = B.buildTrunc(S16, Reg).getReg(0);
4678     }
4679   }
4680 
4681   auto padWithUndef = [&](LLT Ty, int NumElts) {
4682     if (NumElts == 0)
4683       return;
4684     Register Undef = B.buildUndef(Ty).getReg(0);
4685     for (int I = 0; I != NumElts; ++I)
4686       ResultRegs.push_back(Undef);
4687   };
4688 
4689   // Pad out any elements eliminated due to the dmask.
4690   LLT ResTy = MRI->getType(ResultRegs[0]);
4691   if (!ResTy.isVector()) {
4692     padWithUndef(ResTy, NumElts - ResultRegs.size());
4693     B.buildBuildVector(DstReg, ResultRegs);
4694     return true;
4695   }
4696 
4697   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4698   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4699 
4700   // Deal with the one annoying legal case.
4701   const LLT V3S16 = LLT::fixed_vector(3, 16);
4702   if (Ty == V3S16) {
4703     if (IsTFE) {
4704       if (ResultRegs.size() == 1) {
4705         NewResultReg = ResultRegs[0];
4706       } else if (ResultRegs.size() == 2) {
4707         LLT V4S16 = LLT::fixed_vector(4, 16);
4708         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4709       } else {
4710         return false;
4711       }
4712     }
4713 
4714     if (MRI->getType(DstReg).getNumElements() <
4715         MRI->getType(NewResultReg).getNumElements()) {
4716       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4717     } else {
4718       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4719     }
4720     return true;
4721   }
4722 
4723   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4724   B.buildConcatVectors(DstReg, ResultRegs);
4725   return true;
4726 }
4727 
4728 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4729   LegalizerHelper &Helper, MachineInstr &MI) const {
4730   MachineIRBuilder &B = Helper.MIRBuilder;
4731   GISelChangeObserver &Observer = Helper.Observer;
4732 
4733   Register Dst = MI.getOperand(0).getReg();
4734   LLT Ty = B.getMRI()->getType(Dst);
4735   unsigned Size = Ty.getSizeInBits();
4736   MachineFunction &MF = B.getMF();
4737 
4738   Observer.changingInstr(MI);
4739 
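  // If selection patterns cannot handle this result type directly, bitcast
  // the destination to the equivalent register type used for ordinary loads.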
4740   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4741     Ty = getBitcastRegisterType(Ty);
4742     Helper.bitcastDst(MI, Ty, 0);
4743     Dst = MI.getOperand(0).getReg();
4744     B.setInsertPt(B.getMBB(), MI);
4745   }
4746 
4747   // FIXME: We don't really need this intermediate instruction. The intrinsic
4748   // should be fixed to have a memory operand. Since it's readnone, we're not
4749   // allowed to add one.
4750   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4751   MI.RemoveOperand(1); // Remove intrinsic ID
4752 
4753   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4754   // TODO: Should this use datalayout alignment?
4755   const unsigned MemSize = (Size + 7) / 8;
4756   const Align MemAlign(4);
4757   MachineMemOperand *MMO = MF.getMachineMemOperand(
4758       MachinePointerInfo(),
4759       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4760           MachineMemOperand::MOInvariant,
4761       MemSize, MemAlign);
4762   MI.addMemOperand(MF, MMO);
4763 
4764   // There are no 96-bit result scalar loads, but widening to 128-bit should
4765   // always be legal. We may need to restore this to a 96-bit result if it turns
4766   // out this needs to be converted to a vector load during RegBankSelect.
4767   if (!isPowerOf2_32(Size)) {
4768     if (Ty.isVector())
4769       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4770     else
4771       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4772   }
4773 
4774   Observer.changedInstr(MI);
4775   return true;
4776 }
4777 
4778 // TODO: Move to selection
4779 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4780                                                 MachineRegisterInfo &MRI,
4781                                                 MachineIRBuilder &B) const {
4782   if (!ST.isTrapHandlerEnabled() ||
4783       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4784     return legalizeTrapEndpgm(MI, MRI, B);
4785 
4786   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4787     switch (*HsaAbiVer) {
4788     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4789     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4790       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4791     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4792     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
4793       return ST.supportsGetDoorbellID() ?
4794           legalizeTrapHsa(MI, MRI, B) :
4795           legalizeTrapHsaQueuePtr(MI, MRI, B);
4796     }
4797   }
4798 
4799   llvm_unreachable("Unknown trap handler");
4800 }
4801 
4802 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4803     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4804   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4805   MI.eraseFromParent();
4806   return true;
4807 }
4808 
4809 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4810     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass the queue pointer to the trap handler as input, and insert the trap
  // instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4813   Register LiveIn =
4814     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4815   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4816     return false;
4817 
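  // The trap handler expects the queue pointer in SGPR0_SGPR1, passed as an
  // implicit operand of S_TRAP.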
4818   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4819   B.buildCopy(SGPR01, LiveIn);
4820   B.buildInstr(AMDGPU::S_TRAP)
4821       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4822       .addReg(SGPR01, RegState::Implicit);
4823 
4824   MI.eraseFromParent();
4825   return true;
4826 }
4827 
4828 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4829     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4830   B.buildInstr(AMDGPU::S_TRAP)
4831       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4832   MI.eraseFromParent();
4833   return true;
4834 }
4835 
4836 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4837     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4840   if (!ST.isTrapHandlerEnabled() ||
4841       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4842     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4843                                      "debugtrap handler not supported",
4844                                      MI.getDebugLoc(), DS_Warning);
4845     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4846     Ctx.diagnose(NoTrap);
4847   } else {
4848     // Insert debug-trap instruction
4849     B.buildInstr(AMDGPU::S_TRAP)
4850         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4851   }
4852 
4853   MI.eraseFromParent();
4854   return true;
4855 }
4856 
4857 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4858                                                MachineIRBuilder &B) const {
4859   MachineRegisterInfo &MRI = *B.getMRI();
4860   const LLT S16 = LLT::scalar(16);
4861   const LLT S32 = LLT::scalar(32);
4862 
4863   Register DstReg = MI.getOperand(0).getReg();
4864   Register NodePtr = MI.getOperand(2).getReg();
4865   Register RayExtent = MI.getOperand(3).getReg();
4866   Register RayOrigin = MI.getOperand(4).getReg();
4867   Register RayDir = MI.getOperand(5).getReg();
4868   Register RayInvDir = MI.getOperand(6).getReg();
4869   Register TDescr = MI.getOperand(7).getReg();
4870 
4871   if (!ST.hasGFX10_AEncoding()) {
4872     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4873                                         "intrinsic not supported on subtarget",
4874                                         MI.getDebugLoc());
4875     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4876     return false;
4877   }
4878 
4879   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4880   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4881   const unsigned NumVDataDwords = 4;
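  // Address dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
  // ray_dir (3) + ray_inv_dir (3), with ray_dir and ray_inv_dir sharing three
  // dwords when their components are 16 bit.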
4882   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4883   const bool UseNSA =
4884       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4885   const unsigned BaseOpcodes[2][2] = {
4886       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4887       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4888        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4889   int Opcode;
4890   if (UseNSA) {
4891     Opcode =
4892         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4893                               NumVDataDwords, NumVAddrDwords);
4894   } else {
4895     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4896                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4897                                    PowerOf2Ceil(NumVAddrDwords));
4898   }
4899   assert(Opcode != -1);
4900 
4901   SmallVector<Register, 12> Ops;
4902   if (Is64) {
4903     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4904     Ops.push_back(Unmerge.getReg(0));
4905     Ops.push_back(Unmerge.getReg(1));
4906   } else {
4907     Ops.push_back(NodePtr);
4908   }
4909   Ops.push_back(RayExtent);
4910 
4911   auto packLanes = [&Ops, &S32, &B](Register Src) {
4912     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4913     Ops.push_back(Unmerge.getReg(0));
4914     Ops.push_back(Unmerge.getReg(1));
4915     Ops.push_back(Unmerge.getReg(2));
4916   };
4917 
4918   packLanes(RayOrigin);
4919   if (IsA16) {
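    // Repack the three ray_dir halves and three ray_inv_dir halves into three
    // dwords: (dir.x, dir.y), (dir.z, inv.x), (inv.y, inv.z).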
4920     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4921     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4922     Register R1 = MRI.createGenericVirtualRegister(S32);
4923     Register R2 = MRI.createGenericVirtualRegister(S32);
4924     Register R3 = MRI.createGenericVirtualRegister(S32);
4925     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4926     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4927     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4928     Ops.push_back(R1);
4929     Ops.push_back(R2);
4930     Ops.push_back(R3);
4931   } else {
4932     packLanes(RayDir);
4933     packLanes(RayInvDir);
4934   }
4935 
4936   if (!UseNSA) {
4937     // Build a single vector containing all the operands so far prepared.
4938     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4939     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4940     Ops.clear();
4941     Ops.push_back(MergedOps);
4942   }
4943 
4944   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4945     .addDef(DstReg)
4946     .addImm(Opcode);
4947 
4948   for (Register R : Ops) {
4949     MIB.addUse(R);
4950   }
4951 
4952   MIB.addUse(TDescr)
4953      .addImm(IsA16 ? 1 : 0)
4954      .cloneMemRefs(MI);
4955 
4956   MI.eraseFromParent();
4957   return true;
4958 }
4959 
4960 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4961   B.buildConstant(MI.getOperand(0).getReg(), C);
4962   MI.eraseFromParent();
4963   return true;
4964 }
4965 
4966 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4967                                             MachineInstr &MI) const {
4968   MachineIRBuilder &B = Helper.MIRBuilder;
4969   MachineRegisterInfo &MRI = *B.getMRI();
4970 
4971   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4972   auto IntrID = MI.getIntrinsicID();
4973   switch (IntrID) {
4974   case Intrinsic::amdgcn_if:
4975   case Intrinsic::amdgcn_else: {
4976     MachineInstr *Br = nullptr;
4977     MachineBasicBlock *UncondBrTarget = nullptr;
4978     bool Negated = false;
4979     if (MachineInstr *BrCond =
4980             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4981       const SIRegisterInfo *TRI
4982         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4983 
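      // Forward the second (wave-mask) result and the source operand of the
      // intrinsic to the SI_IF/SI_ELSE pseudo; the brcond on the i1 result is
      // replaced below by the pseudo's own branching.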
4984       Register Def = MI.getOperand(1).getReg();
4985       Register Use = MI.getOperand(3).getReg();
4986 
4987       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4988 
4989       if (Negated)
4990         std::swap(CondBrTarget, UncondBrTarget);
4991 
4992       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4993       if (IntrID == Intrinsic::amdgcn_if) {
4994         B.buildInstr(AMDGPU::SI_IF)
4995           .addDef(Def)
4996           .addUse(Use)
4997           .addMBB(UncondBrTarget);
4998       } else {
4999         B.buildInstr(AMDGPU::SI_ELSE)
5000             .addDef(Def)
5001             .addUse(Use)
5002             .addMBB(UncondBrTarget);
5003       }
5004 
5005       if (Br) {
5006         Br->getOperand(0).setMBB(CondBrTarget);
5007       } else {
5008         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5009         // since we're swapping branch targets it needs to be reinserted.
5010         // FIXME: IRTranslator should probably not do this
5011         B.buildBr(*CondBrTarget);
5012       }
5013 
5014       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5015       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5016       MI.eraseFromParent();
5017       BrCond->eraseFromParent();
5018       return true;
5019     }
5020 
5021     return false;
5022   }
5023   case Intrinsic::amdgcn_loop: {
5024     MachineInstr *Br = nullptr;
5025     MachineBasicBlock *UncondBrTarget = nullptr;
5026     bool Negated = false;
5027     if (MachineInstr *BrCond =
5028             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5029       const SIRegisterInfo *TRI
5030         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5031 
5032       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5033       Register Reg = MI.getOperand(2).getReg();
5034 
5035       if (Negated)
5036         std::swap(CondBrTarget, UncondBrTarget);
5037 
5038       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5039       B.buildInstr(AMDGPU::SI_LOOP)
5040         .addUse(Reg)
5041         .addMBB(UncondBrTarget);
5042 
5043       if (Br)
5044         Br->getOperand(0).setMBB(CondBrTarget);
5045       else
5046         B.buildBr(*CondBrTarget);
5047 
5048       MI.eraseFromParent();
5049       BrCond->eraseFromParent();
5050       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5051       return true;
5052     }
5053 
5054     return false;
5055   }
5056   case Intrinsic::amdgcn_kernarg_segment_ptr:
5057     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5058       // This only makes sense to call in a kernel, so just lower to null.
5059       B.buildConstant(MI.getOperand(0).getReg(), 0);
5060       MI.eraseFromParent();
5061       return true;
5062     }
5063 
5064     return legalizePreloadedArgIntrin(
5065       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5066   case Intrinsic::amdgcn_implicitarg_ptr:
5067     return legalizeImplicitArgPtr(MI, MRI, B);
5068   case Intrinsic::amdgcn_workitem_id_x:
5069     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5070       return replaceWithConstant(B, MI, 0);
5071     return legalizePreloadedArgIntrin(MI, MRI, B,
5072                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5073   case Intrinsic::amdgcn_workitem_id_y:
5074     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5075       return replaceWithConstant(B, MI, 0);
5076 
5077     return legalizePreloadedArgIntrin(MI, MRI, B,
5078                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5079   case Intrinsic::amdgcn_workitem_id_z:
5080     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5081       return replaceWithConstant(B, MI, 0);
5082 
5083     return legalizePreloadedArgIntrin(MI, MRI, B,
5084                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5085   case Intrinsic::amdgcn_workgroup_id_x:
5086     return legalizePreloadedArgIntrin(MI, MRI, B,
5087                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5088   case Intrinsic::amdgcn_workgroup_id_y:
5089     return legalizePreloadedArgIntrin(MI, MRI, B,
5090                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5091   case Intrinsic::amdgcn_workgroup_id_z:
5092     return legalizePreloadedArgIntrin(MI, MRI, B,
5093                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5094   case Intrinsic::amdgcn_dispatch_ptr:
5095     return legalizePreloadedArgIntrin(MI, MRI, B,
5096                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5097   case Intrinsic::amdgcn_queue_ptr:
5098     return legalizePreloadedArgIntrin(MI, MRI, B,
5099                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5100   case Intrinsic::amdgcn_implicit_buffer_ptr:
5101     return legalizePreloadedArgIntrin(
5102       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5103   case Intrinsic::amdgcn_dispatch_id:
5104     return legalizePreloadedArgIntrin(MI, MRI, B,
5105                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5106   case Intrinsic::amdgcn_fdiv_fast:
5107     return legalizeFDIVFastIntrin(MI, MRI, B);
5108   case Intrinsic::amdgcn_is_shared:
5109     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5110   case Intrinsic::amdgcn_is_private:
5111     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5112   case Intrinsic::amdgcn_wavefrontsize: {
5113     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5114     MI.eraseFromParent();
5115     return true;
5116   }
5117   case Intrinsic::amdgcn_s_buffer_load:
5118     return legalizeSBufferLoad(Helper, MI);
5119   case Intrinsic::amdgcn_raw_buffer_store:
5120   case Intrinsic::amdgcn_struct_buffer_store:
5121     return legalizeBufferStore(MI, MRI, B, false, false);
5122   case Intrinsic::amdgcn_raw_buffer_store_format:
5123   case Intrinsic::amdgcn_struct_buffer_store_format:
5124     return legalizeBufferStore(MI, MRI, B, false, true);
5125   case Intrinsic::amdgcn_raw_tbuffer_store:
5126   case Intrinsic::amdgcn_struct_tbuffer_store:
5127     return legalizeBufferStore(MI, MRI, B, true, true);
5128   case Intrinsic::amdgcn_raw_buffer_load:
5129   case Intrinsic::amdgcn_struct_buffer_load:
5130     return legalizeBufferLoad(MI, MRI, B, false, false);
5131   case Intrinsic::amdgcn_raw_buffer_load_format:
5132   case Intrinsic::amdgcn_struct_buffer_load_format:
5133     return legalizeBufferLoad(MI, MRI, B, true, false);
5134   case Intrinsic::amdgcn_raw_tbuffer_load:
5135   case Intrinsic::amdgcn_struct_tbuffer_load:
5136     return legalizeBufferLoad(MI, MRI, B, true, true);
5137   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5138   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5139   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5140   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5141   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5142   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5143   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5144   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5145   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5146   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5147   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5148   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5149   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5150   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5151   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5152   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5153   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5154   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5155   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5156   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5157   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5158   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5159   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5160   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5161   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5162   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5163   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5164   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5165   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5166   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5167     return legalizeBufferAtomic(MI, B, IntrID);
5168   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5169   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5170     Register DstReg = MI.getOperand(0).getReg();
5171     if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
5172       Function &F = B.getMF().getFunction();
5173       DiagnosticInfoUnsupported NoFpRet(
5174           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5175           DS_Error);
5176       F.getContext().diagnose(NoFpRet);
5177       B.buildUndef(DstReg);
5178       MI.eraseFromParent();
5179       return true;
5180     }
5181 
5182     return legalizeBufferAtomic(MI, B, IntrID);
5183   }
5184   case Intrinsic::amdgcn_atomic_inc:
5185     return legalizeAtomicIncDec(MI, B, true);
5186   case Intrinsic::amdgcn_atomic_dec:
5187     return legalizeAtomicIncDec(MI, B, false);
5188   case Intrinsic::trap:
5189     return legalizeTrapIntrinsic(MI, MRI, B);
5190   case Intrinsic::debugtrap:
5191     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5192   case Intrinsic::amdgcn_rsq_clamp:
5193     return legalizeRsqClampIntrinsic(MI, MRI, B);
5194   case Intrinsic::amdgcn_ds_fadd:
5195   case Intrinsic::amdgcn_ds_fmin:
5196   case Intrinsic::amdgcn_ds_fmax:
5197     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5198   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5199     return legalizeBVHIntrinsic(MI, B);
5200   default: {
5201     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5202             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5203       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5204     return true;
5205   }
5206   }
5207 
5208   return true;
5209 }
5210