1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
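// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.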
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
// Round the number of bits up to the next power of two.
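// e.g. s48 -> s64, s96 -> s128.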
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
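/// e.g. <5 x s16> (80 bits) qualifies, but <4 x s16> and <3 x s32> do not.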
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
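// Reduce the element count so each resulting piece fits in 64 bits,
// e.g. <4 x s32> -> <2 x s32> and <16 x s16> -> <4 x s16>.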
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
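// e.g. <3 x s8> -> <4 x s8> and <5 x s16> -> <6 x s16>.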
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
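// e.g. s32..s1024 (in 32-bit steps), v2s16, v4s16, v2s32..v32s32 and
// v2s64..v16s64 are register types, while <3 x s16> and s48 are not.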
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
// If we have a truncating store or an extending load with a data size larger
// than 32 bits, we need to reduce to a 32-bit type.
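// e.g. a G_ZEXTLOAD of s16 memory into an s64 register is first narrowed to
// an s32 result.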
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
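// e.g. <6 x s16> and s96 are both bitcast to <3 x s32> before selection.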
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
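/// e.g. a <4 x s8> load is bitcast to s32, and an <8 x s8> load to <2 x s32>.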
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
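/// e.g. a sufficiently aligned, dereferenceable 96-bit load may be widened to
/// 128 bits when dwordx3 load/store instructions are unavailable.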
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
  : ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
843       .customFor({S16, S32})
844       .scalarize(0)
845       .lower();
846 
847   // Lower roundeven into G_FRINT
848   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
849     .scalarize(0)
850     .lower();
851 
852   if (ST.has16BitInsts()) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S16, S32, S64})
855       .clampScalar(0, S16, S64)
856       .scalarize(0);
857   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32, S64})
860       .clampScalar(0, S32, S64)
861       .scalarize(0);
862   } else {
863     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
864       .legalFor({S32})
865       .customFor({S64})
866       .clampScalar(0, S32, S64)
867       .scalarize(0);
868   }
869 
870   getActionDefinitionsBuilder(G_PTR_ADD)
871     .legalIf(all(isPointer(0), sameSize(0, 1)))
872     .scalarize(0)
873     .scalarSameSizeAs(1, 0);
874 
875   getActionDefinitionsBuilder(G_PTRMASK)
876     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
877     .scalarSameSizeAs(1, 0)
878     .scalarize(0);
879 
880   auto &CmpBuilder =
881     getActionDefinitionsBuilder(G_ICMP)
882     // The compare output type differs based on the register bank of the output,
883     // so make both s1 and s32 legal.
884     //
885     // Scalar compares producing output in scc will be promoted to s32, as that
886     // is the allocatable register type that will be needed for the copy from
887     // scc. This will be promoted during RegBankSelect, and we assume something
888     // before that won't try to use s32 result types.
889     //
890     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
891     // bank.
892     .legalForCartesianProduct(
893       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
894     .legalForCartesianProduct(
895       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
896   if (ST.has16BitInsts()) {
897     CmpBuilder.legalFor({{S1, S16}});
898   }
899 
900   CmpBuilder
901     .widenScalarToNextPow2(1)
902     .clampScalar(1, S32, S64)
903     .scalarize(0)
904     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
905 
906   getActionDefinitionsBuilder(G_FCMP)
907     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
908     .widenScalarToNextPow2(1)
909     .clampScalar(1, S32, S64)
910     .scalarize(0);
911 
912   // FIXME: fpow has a selection pattern that should move to custom lowering.
913   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
914   if (ST.has16BitInsts())
915     Exp2Ops.legalFor({S32, S16});
916   else
917     Exp2Ops.legalFor({S32});
918   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
919   Exp2Ops.scalarize(0);
920 
921   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
922   if (ST.has16BitInsts())
923     ExpOps.customFor({{S32}, {S16}});
924   else
925     ExpOps.customFor({S32});
926   ExpOps.clampScalar(0, MinScalarFPTy, S32)
927         .scalarize(0);
928 
929   getActionDefinitionsBuilder(G_FPOWI)
930     .clampScalar(0, MinScalarFPTy, S32)
931     .lower();
932 
933   // The 64-bit versions produce 32-bit results, but only on the SALU.
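  // e.g. G_CTPOP with an s64 input is legal with an s32 result.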
934   getActionDefinitionsBuilder(G_CTPOP)
935     .legalFor({{S32, S32}, {S32, S64}})
936     .clampScalar(0, S32, S32)
937     .widenScalarToNextPow2(1, 32)
938     .clampScalar(1, S32, S64)
939     .scalarize(0)
940     .widenScalarToNextPow2(0, 32);
941 
942 
943   // The hardware instructions return a different result on 0 than the generic
944   // instructions expect. The hardware produces -1, but these produce the
945   // bitwidth.
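  // e.g. G_CTLZ of an s32 zero must yield 32, so the zero input has to be
  // handled specially in the custom lowering.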
946   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
947     .scalarize(0)
948     .clampScalar(0, S32, S32)
949     .clampScalar(1, S32, S64)
950     .widenScalarToNextPow2(0, 32)
951     .widenScalarToNextPow2(1, 32)
952     .custom();
953 
954   // The 64-bit versions produce 32-bit results, but only on the SALU.
955   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
956     .legalFor({{S32, S32}, {S32, S64}})
957     .clampScalar(0, S32, S32)
958     .clampScalar(1, S32, S64)
959     .scalarize(0)
960     .widenScalarToNextPow2(0, 32)
961     .widenScalarToNextPow2(1, 32);
962 
963   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
964   // RegBankSelect.
965   getActionDefinitionsBuilder(G_BITREVERSE)
966     .legalFor({S32, S64})
967     .clampScalar(0, S32, S64)
968     .scalarize(0)
969     .widenScalarToNextPow2(0);
970 
971   if (ST.has16BitInsts()) {
972     getActionDefinitionsBuilder(G_BSWAP)
973       .legalFor({S16, S32, V2S16})
974       .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
977       .widenScalarToNextPow2(0)
978       .clampScalar(0, S16, S32)
979       .scalarize(0);
980 
981     if (ST.hasVOP3PInsts()) {
982       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983         .legalFor({S32, S16, V2S16})
984         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
985         .clampMaxNumElements(0, S16, 2)
986         .minScalar(0, S16)
987         .widenScalarToNextPow2(0)
988         .scalarize(0)
989         .lower();
990     } else {
991       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
992         .legalFor({S32, S16})
993         .widenScalarToNextPow2(0)
994         .minScalar(0, S16)
995         .scalarize(0)
996         .lower();
997     }
998   } else {
999     // TODO: Should have same legality without v_perm_b32
1000     getActionDefinitionsBuilder(G_BSWAP)
1001       .legalFor({S32})
1002       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1005       .widenScalarToNextPow2(0)
1006       .maxScalar(0, S32)
1007       .scalarize(0)
1008       .lower();
1009 
1010     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1011       .legalFor({S32})
1012       .minScalar(0, S32)
1013       .widenScalarToNextPow2(0)
1014       .scalarize(0)
1015       .lower();
1016   }
1017 
1018   getActionDefinitionsBuilder(G_INTTOPTR)
1019     // List the common cases
1020     .legalForCartesianProduct(AddrSpaces64, {S64})
1021     .legalForCartesianProduct(AddrSpaces32, {S32})
1022     .scalarize(0)
1023     // Accept any address space as long as the size matches
1024     .legalIf(sameSize(0, 1))
1025     .widenScalarIf(smallerThan(1, 0),
1026       [](const LegalityQuery &Query) {
1027         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1028       })
1029     .narrowScalarIf(largerThan(1, 0),
1030       [](const LegalityQuery &Query) {
1031         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1032       });
1033 
1034   getActionDefinitionsBuilder(G_PTRTOINT)
1035     // List the common cases
1036     .legalForCartesianProduct(AddrSpaces64, {S64})
1037     .legalForCartesianProduct(AddrSpaces32, {S32})
1038     .scalarize(0)
1039     // Accept any address space as long as the size matches
1040     .legalIf(sameSize(0, 1))
1041     .widenScalarIf(smallerThan(0, 1),
1042       [](const LegalityQuery &Query) {
1043         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1044       })
1045     .narrowScalarIf(
1046       largerThan(0, 1),
1047       [](const LegalityQuery &Query) {
1048         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1049       });
1050 
1051   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1052     .scalarize(0)
1053     .custom();
1054 
1055   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1056                                     bool IsLoad) -> bool {
1057     const LLT DstTy = Query.Types[0];
1058 
1059     // Split vector extloads.
1060     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1061 
1062     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1063       return true;
1064 
1065     const LLT PtrTy = Query.Types[1];
1066     unsigned AS = PtrTy.getAddressSpace();
1067     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1068       return true;
1069 
1070     // Catch weird sized loads that don't evenly divide into the access sizes
1071     // TODO: May be able to widen depending on alignment etc.
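    // e.g. a 160-bit (5 dword) access is not a power-of-two number of dwords
    // and must be split.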
1072     unsigned NumRegs = (MemSize + 31) / 32;
1073     if (NumRegs == 3) {
1074       if (!ST.hasDwordx3LoadStores())
1075         return true;
1076     } else {
1077       // If the alignment allows, these should have been widened.
1078       if (!isPowerOf2_32(NumRegs))
1079         return true;
1080     }
1081 
1082     return false;
1083   };
1084 
1085   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1086   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1087   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1088 
1089   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1090   // LDS
1091   // TODO: Unsupported flat for SI.
1092 
1093   for (unsigned Op : {G_LOAD, G_STORE}) {
1094     const bool IsStore = Op == G_STORE;
1095 
1096     auto &Actions = getActionDefinitionsBuilder(Op);
1097     // Explicitly list some common cases.
1098     // TODO: Does this help compile time at all?
1099     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1100                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1101                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1102                                       {S64, GlobalPtr, S64, GlobalAlign32},
1103                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1104                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1105                                       {S32, GlobalPtr, S8, GlobalAlign8},
1106                                       {S32, GlobalPtr, S16, GlobalAlign16},
1107 
1108                                       {S32, LocalPtr, S32, 32},
1109                                       {S64, LocalPtr, S64, 32},
1110                                       {V2S32, LocalPtr, V2S32, 32},
1111                                       {S32, LocalPtr, S8, 8},
1112                                       {S32, LocalPtr, S16, 16},
1113                                       {V2S16, LocalPtr, S32, 32},
1114 
1115                                       {S32, PrivatePtr, S32, 32},
1116                                       {S32, PrivatePtr, S8, 8},
1117                                       {S32, PrivatePtr, S16, 16},
1118                                       {V2S16, PrivatePtr, S32, 32},
1119 
1120                                       {S32, ConstantPtr, S32, GlobalAlign32},
1121                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1122                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1123                                       {S64, ConstantPtr, S64, GlobalAlign32},
1124                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1125     Actions.legalIf(
1126       [=](const LegalityQuery &Query) -> bool {
1127         return isLoadStoreLegal(ST, Query);
1128       });
1129 
    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64 bits.
1132     //
1133     // TODO: Should generalize bitcast action into coerce, which will also cover
1134     // inserting addrspacecasts.
1135     Actions.customIf(typeIs(1, Constant32Ptr));
1136 
1137     // Turn any illegal element vectors into something easier to deal
1138     // with. These will ultimately produce 32-bit scalar shifts to extract the
1139     // parts anyway.
1140     //
1141     // For odd 16-bit element vectors, prefer to split those into pieces with
1142     // 16-bit vector parts.
1143     Actions.bitcastIf(
1144       [=](const LegalityQuery &Query) -> bool {
1145         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1146                                           Query.MMODescrs[0].MemoryTy);
1147       }, bitcastToRegisterType(0));
1148 
1149     if (!IsStore) {
1150       // Widen suitably aligned loads by loading extra bytes. The standard
1151       // legalization actions can't properly express widening memory operands.
1152       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1153         return shouldWidenLoad(ST, Query, G_LOAD);
1154       });
1155     }
1156 
1157     // FIXME: load/store narrowing should be moved to lower action
1158     Actions
1159         .narrowScalarIf(
1160             [=](const LegalityQuery &Query) -> bool {
1161               return !Query.Types[0].isVector() &&
1162                      needToSplitMemOp(Query, Op == G_LOAD);
1163             },
1164             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1165               const LLT DstTy = Query.Types[0];
1166               const LLT PtrTy = Query.Types[1];
1167 
1168               const unsigned DstSize = DstTy.getSizeInBits();
1169               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1170 
1171               // Split extloads.
1172               if (DstSize > MemSize)
1173                 return std::make_pair(0, LLT::scalar(MemSize));
1174 
1175               unsigned MaxSize = maxSizeForAddrSpace(ST,
1176                                                      PtrTy.getAddressSpace(),
1177                                                      Op == G_LOAD);
1178               if (MemSize > MaxSize)
1179                 return std::make_pair(0, LLT::scalar(MaxSize));
1180 
1181               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1182               return std::make_pair(0, LLT::scalar(Align));
1183             })
1184         .fewerElementsIf(
1185             [=](const LegalityQuery &Query) -> bool {
1186               return Query.Types[0].isVector() &&
1187                      needToSplitMemOp(Query, Op == G_LOAD);
1188             },
1189             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190               const LLT DstTy = Query.Types[0];
1191               const LLT PtrTy = Query.Types[1];
1192 
1193               LLT EltTy = DstTy.getElementType();
1194               unsigned MaxSize = maxSizeForAddrSpace(ST,
1195                                                      PtrTy.getAddressSpace(),
1196                                                      Op == G_LOAD);
1197 
1198               // FIXME: Handle widened to power of 2 results better. This ends
1199               // up scalarizing.
1200               // FIXME: 3 element stores scalarized on SI
1201 
1202               // Split if it's too large for the address space.
1203               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1204               if (MemSize > MaxSize) {
1205                 unsigned NumElts = DstTy.getNumElements();
1206                 unsigned EltSize = EltTy.getSizeInBits();
1207 
1208                 if (MaxSize % EltSize == 0) {
1209                   return std::make_pair(
1210                       0, LLT::scalarOrVector(
1211                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1212                 }
1213 
1214                 unsigned NumPieces = MemSize / MaxSize;
1215 
1216                 // FIXME: Refine when odd breakdowns handled
1217                 // The scalars will need to be re-legalized.
1218                 if (NumPieces == 1 || NumPieces >= NumElts ||
1219                     NumElts % NumPieces != 0)
1220                   return std::make_pair(0, EltTy);
1221 
1222                 return std::make_pair(
1223                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1224               }
1225 
1226               // FIXME: We could probably handle weird extending loads better.
1227               if (DstTy.getSizeInBits() > MemSize)
1228                 return std::make_pair(0, EltTy);
1229 
1230               unsigned EltSize = EltTy.getSizeInBits();
1231               unsigned DstSize = DstTy.getSizeInBits();
1232               if (!isPowerOf2_32(DstSize)) {
1233                 // We're probably decomposing an odd sized store. Try to split
1234                 // to the widest type. TODO: Account for alignment. As-is it
1235                 // should be OK, since the new parts will be further legalized.
1236                 unsigned FloorSize = PowerOf2Floor(DstSize);
1237                 return std::make_pair(
1238                     0, LLT::scalarOrVector(
1239                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1240               }
1241 
1242               // May need relegalization for the scalars.
1243               return std::make_pair(0, EltTy);
1244             })
1245     .minScalar(0, S32)
1246     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1247     .widenScalarToNextPow2(0)
1248     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1249     .lower();
1250   }
1251 
1252   // FIXME: Unaligned accesses not lowered.
1253   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1254                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1255                                                   {S32, GlobalPtr, S16, 2 * 8},
1256                                                   {S32, LocalPtr, S8, 8},
1257                                                   {S32, LocalPtr, S16, 16},
1258                                                   {S32, PrivatePtr, S8, 8},
1259                                                   {S32, PrivatePtr, S16, 16},
1260                                                   {S32, ConstantPtr, S8, 8},
1261                                                   {S32, ConstantPtr, S16, 2 * 8}})
1262                        .legalIf(
1263                          [=](const LegalityQuery &Query) -> bool {
1264                            return isLoadStoreLegal(ST, Query);
1265                          });
1266 
1267   if (ST.hasFlatAddressSpace()) {
1268     ExtLoads.legalForTypesWithMemDesc(
1269         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1270   }
1271 
  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64 bits.
1274   //
1275   // TODO: Should generalize bitcast action into coerce, which will also cover
1276   // inserting addrspacecasts.
1277   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1278 
1279   ExtLoads.clampScalar(0, S32, S32)
1280           .widenScalarToNextPow2(0)
1281           .lower();
1282 
1283   auto &Atomics = getActionDefinitionsBuilder(
1284     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287      G_ATOMICRMW_UMIN})
1288     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289                {S64, GlobalPtr}, {S64, LocalPtr},
1290                {S32, RegionPtr}, {S64, RegionPtr}});
1291   if (ST.hasFlatAddressSpace()) {
1292     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293   }
1294 
1295   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296   if (ST.hasLDSFPAtomicAdd()) {
1297     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298     if (ST.hasGFX90AInsts())
1299       Atomic.legalFor({{S64, LocalPtr}});
1300     if (ST.hasGFX940Insts())
1301       Atomic.legalFor({{V2S16, LocalPtr}});
1302   }
1303   if (ST.hasAtomicFaddInsts())
1304     Atomic.legalFor({{S32, GlobalPtr}});
1305 
1306   if (ST.hasGFX90AInsts()) {
1307     // These are legal with some caveats, and should have undergone expansion in
1308     // the IR in most situations
1309     // TODO: Move atomic expansion into legalizer
1310     // TODO: Also supports <2 x f16>
1311     Atomic.legalFor({
1312         {S32, GlobalPtr},
1313         {S64, GlobalPtr},
1314         {S64, FlatPtr}
1315       });
1316   }
1317 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1320   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1321     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1322                 {S32, FlatPtr}, {S64, FlatPtr}})
1323     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1324                {S32, RegionPtr}, {S64, RegionPtr}});
1325   // TODO: Pointer types, any 32-bit or 64-bit vector
1326 
1327   // Condition should be s32 for scalar, s1 for vector.
1328   getActionDefinitionsBuilder(G_SELECT)
1329       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1330                                  LocalPtr, FlatPtr, PrivatePtr,
1331                                  LLT::fixed_vector(2, LocalPtr),
1332                                  LLT::fixed_vector(2, PrivatePtr)},
1333                                 {S1, S32})
1334       .clampScalar(0, S16, S64)
1335       .scalarize(1)
1336       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1337       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1338       .clampMaxNumElements(0, S32, 2)
1339       .clampMaxNumElements(0, LocalPtr, 2)
1340       .clampMaxNumElements(0, PrivatePtr, 2)
1341       .scalarize(0)
1342       .widenScalarToNextPow2(0)
1343       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1344 
1345   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1346   // be more flexible with the shift amount type.
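  // e.g. a 32-bit shift only reads the low 5 bits of its amount.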
1347   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1348     .legalFor({{S32, S32}, {S64, S32}});
1349   if (ST.has16BitInsts()) {
1350     if (ST.hasVOP3PInsts()) {
1351       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1352             .clampMaxNumElements(0, S16, 2);
1353     } else
1354       Shifts.legalFor({{S16, S16}});
1355 
1356     // TODO: Support 16-bit shift amounts for all types
1357     Shifts.widenScalarIf(
1358       [=](const LegalityQuery &Query) {
1359         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1360         // 32-bit amount.
1361         const LLT ValTy = Query.Types[0];
1362         const LLT AmountTy = Query.Types[1];
1363         return ValTy.getSizeInBits() <= 16 &&
1364                AmountTy.getSizeInBits() < 16;
1365       }, changeTo(1, S16));
1366     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1367     Shifts.clampScalar(1, S32, S32);
1368     Shifts.widenScalarToNextPow2(0, 16);
1369     Shifts.clampScalar(0, S16, S64);
1370 
1371     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1372       .minScalar(0, S16)
1373       .scalarize(0)
1374       .lower();
1375   } else {
1376     // Make sure we legalize the shift amount type first, as the general
1377     // expansion for the shifted type will produce much worse code if it hasn't
1378     // been truncated already.
1379     Shifts.clampScalar(1, S32, S32);
1380     Shifts.widenScalarToNextPow2(0, 32);
1381     Shifts.clampScalar(0, S32, S64);
1382 
1383     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384       .minScalar(0, S32)
1385       .scalarize(0)
1386       .lower();
1387   }
1388   Shifts.scalarize(0);
1389 
1390   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1391     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1392     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1393     unsigned IdxTypeIdx = 2;
1394 
1395     getActionDefinitionsBuilder(Op)
1396       .customIf([=](const LegalityQuery &Query) {
1397           const LLT EltTy = Query.Types[EltTypeIdx];
1398           const LLT VecTy = Query.Types[VecTypeIdx];
1399           const LLT IdxTy = Query.Types[IdxTypeIdx];
1400           const unsigned EltSize = EltTy.getSizeInBits();
1401           return (EltSize == 32 || EltSize == 64) &&
1402                   VecTy.getSizeInBits() % 32 == 0 &&
1403                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1404                   IdxTy.getSizeInBits() == 32;
1405         })
1406       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1407                  bitcastToVectorElement32(VecTypeIdx))
1408       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1409       .bitcastIf(
1410         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1411         [=](const LegalityQuery &Query) {
1412           // For > 64-bit element types, try to turn this into a 64-bit
1413           // element vector since we may be able to do better indexing
1414           // if this is scalar. If not, fall back to 32.
1415           const LLT EltTy = Query.Types[EltTypeIdx];
1416           const LLT VecTy = Query.Types[VecTypeIdx];
1417           const unsigned DstEltSize = EltTy.getSizeInBits();
1418           const unsigned VecSize = VecTy.getSizeInBits();
1419 
1420           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1421           return std::make_pair(
1422               VecTypeIdx,
1423               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1424         })
1425       .clampScalar(EltTypeIdx, S32, S64)
1426       .clampScalar(VecTypeIdx, S32, S64)
1427       .clampScalar(IdxTypeIdx, S32, S32)
1428       .clampMaxNumElements(VecTypeIdx, S32, 32)
1429       // TODO: Clamp elements for 64-bit vectors?
1430       // It should only be necessary with variable indexes.
1431       // As a last resort, lower to the stack
1432       .lower();
1433   }
1434 
1435   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436     .unsupportedIf([=](const LegalityQuery &Query) {
1437         const LLT &EltTy = Query.Types[1].getElementType();
1438         return Query.Types[0] != EltTy;
1439       });
1440 
1441   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444 
1445     // FIXME: Doesn't handle extract of illegal sizes.
1446     getActionDefinitionsBuilder(Op)
1447       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448       .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
1450           // TODO: verify immediate offset here since lower only works with
1451           // whole elements.
1452           const LLT BigTy = Query.Types[BigTyIdx];
1453           return BigTy.isVector();
1454         })
1455       // FIXME: Multiples of 16 should not be legal.
1456       .legalIf([=](const LegalityQuery &Query) {
1457           const LLT BigTy = Query.Types[BigTyIdx];
1458           const LLT LitTy = Query.Types[LitTyIdx];
1459           return (BigTy.getSizeInBits() % 32 == 0) &&
1460                  (LitTy.getSizeInBits() % 16 == 0);
1461         })
1462       .widenScalarIf(
1463         [=](const LegalityQuery &Query) {
1464           const LLT BigTy = Query.Types[BigTyIdx];
1465           return (BigTy.getScalarSizeInBits() < 16);
1466         },
1467         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1468       .widenScalarIf(
1469         [=](const LegalityQuery &Query) {
1470           const LLT LitTy = Query.Types[LitTyIdx];
1471           return (LitTy.getScalarSizeInBits() < 16);
1472         },
1473         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1474       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1475       .widenScalarToNextPow2(BigTyIdx, 32);
1476 
1477   }
1478 
1479   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1480     .legalForCartesianProduct(AllS32Vectors, {S32})
1481     .legalForCartesianProduct(AllS64Vectors, {S64})
1482     .clampNumElements(0, V16S32, V32S32)
1483     .clampNumElements(0, V2S64, V16S64)
1484     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1485 
1486   if (ST.hasScalarPackInsts()) {
1487     BuildVector
1488       // FIXME: Should probably widen s1 vectors straight to s32
1489       .minScalarOrElt(0, S16)
1490       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1491       .minScalar(1, S32);
1492 
1493     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1494       .legalFor({V2S16, S32})
1495       .lower();
1497   } else {
1498     BuildVector.customFor({V2S16, S16});
1499     BuildVector.minScalarOrElt(0, S32);
1500 
1501     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1502       .customFor({V2S16, S32})
1503       .lower();
1504   }
1505 
1506   BuildVector.legalIf(isRegisterType(0));
1507 
1508   // FIXME: Clamp maximum size
1509   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1510     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1511     .clampMaxNumElements(0, S32, 32)
1512     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1513     .clampMaxNumElements(0, S16, 64);
1514 
1515   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1516   // pre-legalize.
1517   if (ST.hasVOP3PInsts()) {
1518     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1519       .customFor({V2S16, V2S16})
1520       .lower();
1521   } else
1522     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1523 
1524   // Merge/Unmerge
1525   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1526     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1527     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1528 
1529     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1530       const LLT Ty = Query.Types[TypeIdx];
1531       if (Ty.isVector()) {
1532         const LLT &EltTy = Ty.getElementType();
1533         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1534           return true;
1535         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1536           return true;
1537       }
1538       return false;
1539     };
1540 
1541     auto &Builder = getActionDefinitionsBuilder(Op)
1542       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1543       .lowerFor({{S16, V2S16}})
1544       .lowerIf([=](const LegalityQuery &Query) {
1545           const LLT BigTy = Query.Types[BigTyIdx];
1546           return BigTy.getSizeInBits() == 32;
1547         })
1548       // Try to widen to s16 first for small types.
1549       // TODO: Only do this on targets with legal s16 shifts
1550       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1551       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1552       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1553       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1554                            elementTypeIs(1, S16)),
1555                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1559       .clampScalar(LitTyIdx, S32, S512)
1560       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1561       // Break up vectors with weird elements into scalars
1562       .fewerElementsIf(
1563         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1564         scalarize(0))
1565       .fewerElementsIf(
1566         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1567         scalarize(1))
1568       .clampScalar(BigTyIdx, S32, MaxScalar);
1569 
1570     if (Op == G_MERGE_VALUES) {
1571       Builder.widenScalarIf(
1572         // TODO: Use 16-bit shifts if legal for 8-bit values?
1573         [=](const LegalityQuery &Query) {
1574           const LLT Ty = Query.Types[LitTyIdx];
1575           return Ty.getSizeInBits() < 32;
1576         },
1577         changeTo(LitTyIdx, S32));
1578     }
1579 
1580     Builder.widenScalarIf(
1581       [=](const LegalityQuery &Query) {
1582         const LLT Ty = Query.Types[BigTyIdx];
1583         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1584           Ty.getSizeInBits() % 16 != 0;
1585       },
1586       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 for sizes over 128,
        // whichever is smaller.
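        // For example, a 257-bit big type widens to s320 (the next multiple
        // of 64) rather than all the way to s512.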
1589         const LLT &Ty = Query.Types[BigTyIdx];
1590         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1591         if (NewSizeInBits >= 256) {
1592           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1593           if (RoundedTo < NewSizeInBits)
1594             NewSizeInBits = RoundedTo;
1595         }
1596         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1597       })
1598       // Any vectors left are the wrong size. Scalarize them.
1599       .scalarize(0)
1600       .scalarize(1);
1601   }
1602 
1603   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1604   // RegBankSelect.
1605   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1606     .legalFor({{S32}, {S64}});
1607 
1608   if (ST.hasVOP3PInsts()) {
1609     SextInReg.lowerFor({{V2S16}})
1610       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1611       // get more vector shift opportunities, since we'll get those when
1612       // expanded.
1613       .clampMaxNumElementsStrict(0, S16, 2);
1614   } else if (ST.has16BitInsts()) {
1615     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1616   } else {
1617     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1619     SextInReg.lowerFor({{S32}, {S64}});
1620   }
1621 
1622   SextInReg
1623     .scalarize(0)
1624     .clampScalar(0, S32, S64)
1625     .lower();
1626 
1627   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1628     .scalarize(0)
1629     .lower();
1630 
  // TODO: Only try to form v2s16 with legal packed instructions.
1632   getActionDefinitionsBuilder(G_FSHR)
1633     .legalFor({{S32, S32}})
1634     .lowerFor({{V2S16, V2S16}})
1635     .clampMaxNumElementsStrict(0, S16, 2)
1636     .scalarize(0)
1637     .lower();
1638 
1639   if (ST.hasVOP3PInsts()) {
1640     getActionDefinitionsBuilder(G_FSHL)
1641       .lowerFor({{V2S16, V2S16}})
1642       .clampMaxNumElementsStrict(0, S16, 2)
1643       .scalarize(0)
1644       .lower();
1645   } else {
1646     getActionDefinitionsBuilder(G_FSHL)
1647       .scalarize(0)
1648       .lower();
1649   }
1650 
1651   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652     .legalFor({S64});
1653 
1654   getActionDefinitionsBuilder(G_FENCE)
1655     .alwaysLegal();
1656 
1657   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658       .scalarize(0)
1659       .minScalar(0, S32)
1660       .lower();
1661 
1662   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663       .legalFor({{S32, S32}, {S64, S32}})
1664       .clampScalar(1, S32, S32)
1665       .clampScalar(0, S32, S64)
1666       .widenScalarToNextPow2(0)
1667       .scalarize(0);
1668 
1669   getActionDefinitionsBuilder({
1670       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1671       G_FCOPYSIGN,
1672 
1673       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674       G_ATOMICRMW_NAND,
1675       G_ATOMICRMW_FSUB,
1676       G_READ_REGISTER,
1677       G_WRITE_REGISTER,
1678 
1679       G_SADDO, G_SSUBO,
1680 
      // TODO: Implement
1682       G_FMINIMUM, G_FMAXIMUM}).lower();
1683 
1684   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1685       .lower();
1686 
1687   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1688         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1689         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1690     .unsupported();
1691 
1692   getLegacyLegalizerInfo().computeTables();
1693   verify(*ST.getInstrInfo());
1694 }
1695 
1696 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1697                                          MachineInstr &MI) const {
1698   MachineIRBuilder &B = Helper.MIRBuilder;
1699   MachineRegisterInfo &MRI = *B.getMRI();
1700 
1701   switch (MI.getOpcode()) {
1702   case TargetOpcode::G_ADDRSPACE_CAST:
1703     return legalizeAddrSpaceCast(MI, MRI, B);
1704   case TargetOpcode::G_FRINT:
1705     return legalizeFrint(MI, MRI, B);
1706   case TargetOpcode::G_FCEIL:
1707     return legalizeFceil(MI, MRI, B);
1708   case TargetOpcode::G_FREM:
1709     return legalizeFrem(MI, MRI, B);
1710   case TargetOpcode::G_INTRINSIC_TRUNC:
1711     return legalizeIntrinsicTrunc(MI, MRI, B);
1712   case TargetOpcode::G_SITOFP:
1713     return legalizeITOFP(MI, MRI, B, true);
1714   case TargetOpcode::G_UITOFP:
1715     return legalizeITOFP(MI, MRI, B, false);
1716   case TargetOpcode::G_FPTOSI:
1717     return legalizeFPTOI(MI, MRI, B, true);
1718   case TargetOpcode::G_FPTOUI:
1719     return legalizeFPTOI(MI, MRI, B, false);
1720   case TargetOpcode::G_FMINNUM:
1721   case TargetOpcode::G_FMAXNUM:
1722   case TargetOpcode::G_FMINNUM_IEEE:
1723   case TargetOpcode::G_FMAXNUM_IEEE:
1724     return legalizeMinNumMaxNum(Helper, MI);
1725   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1726     return legalizeExtractVectorElt(MI, MRI, B);
1727   case TargetOpcode::G_INSERT_VECTOR_ELT:
1728     return legalizeInsertVectorElt(MI, MRI, B);
1729   case TargetOpcode::G_SHUFFLE_VECTOR:
1730     return legalizeShuffleVector(MI, MRI, B);
1731   case TargetOpcode::G_FSIN:
1732   case TargetOpcode::G_FCOS:
1733     return legalizeSinCos(MI, MRI, B);
1734   case TargetOpcode::G_GLOBAL_VALUE:
1735     return legalizeGlobalValue(MI, MRI, B);
1736   case TargetOpcode::G_LOAD:
1737   case TargetOpcode::G_SEXTLOAD:
1738   case TargetOpcode::G_ZEXTLOAD:
1739     return legalizeLoad(Helper, MI);
1740   case TargetOpcode::G_FMAD:
1741     return legalizeFMad(MI, MRI, B);
1742   case TargetOpcode::G_FDIV:
1743     return legalizeFDIV(MI, MRI, B);
1744   case TargetOpcode::G_UDIV:
1745   case TargetOpcode::G_UREM:
1746   case TargetOpcode::G_UDIVREM:
1747     return legalizeUnsignedDIV_REM(MI, MRI, B);
1748   case TargetOpcode::G_SDIV:
1749   case TargetOpcode::G_SREM:
1750   case TargetOpcode::G_SDIVREM:
1751     return legalizeSignedDIV_REM(MI, MRI, B);
1752   case TargetOpcode::G_ATOMIC_CMPXCHG:
1753     return legalizeAtomicCmpXChg(MI, MRI, B);
1754   case TargetOpcode::G_FLOG:
1755     return legalizeFlog(MI, B, numbers::ln2f);
1756   case TargetOpcode::G_FLOG10:
1757     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1758   case TargetOpcode::G_FEXP:
1759     return legalizeFExp(MI, B);
1760   case TargetOpcode::G_FPOW:
1761     return legalizeFPow(MI, B);
1762   case TargetOpcode::G_FFLOOR:
1763     return legalizeFFloor(MI, MRI, B);
1764   case TargetOpcode::G_BUILD_VECTOR:
1765     return legalizeBuildVector(MI, MRI, B);
1766   case TargetOpcode::G_CTLZ:
1767   case TargetOpcode::G_CTTZ:
1768     return legalizeCTLZ_CTTZ(MI, MRI, B);
1769   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1770     return legalizeFPTruncRound(MI, B);
1771   default:
1772     return false;
1773   }
1774 
1775   llvm_unreachable("expected switch to return");
1776 }
1777 
1778 Register AMDGPULegalizerInfo::getSegmentAperture(
1779   unsigned AS,
1780   MachineRegisterInfo &MRI,
1781   MachineIRBuilder &B) const {
1782   MachineFunction &MF = B.getMF();
1783   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1784   const LLT S32 = LLT::scalar(32);
1785 
1786   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1787 
1788   if (ST.hasApertureRegs()) {
1789     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1790     // getreg.
1791     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1792         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1793         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1794     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1795         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1796         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1797     unsigned Encoding =
1798         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1799         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1800         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1801 
1802     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1803 
1804     B.buildInstr(AMDGPU::S_GETREG_B32)
1805       .addDef(GetReg)
1806       .addImm(Encoding);
1807     MRI.setType(GetReg, S32);
1808 
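    // The value read via getreg is the aperture base stored shifted right by
    // the field width (WidthM1 + 1); shift it back up to recover the 32-bit
    // aperture base, which forms the high half of the 64-bit segment address.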
1809     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1810     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1811   }
1812 
1813   Register QueuePtr = MRI.createGenericVirtualRegister(
1814     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1815 
1816   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1817     return Register();
1818 
1819   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1820   // private_segment_aperture_base_hi.
1821   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1822 
1823   // TODO: can we be smarter about machine pointer info?
1824   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1825   MachineMemOperand *MMO = MF.getMachineMemOperand(
1826       PtrInfo,
1827       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1828           MachineMemOperand::MOInvariant,
1829       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1830 
1831   Register LoadAddr;
1832 
1833   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1834   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1835 }
1836 
1837 /// Return true if the value is a known valid address, such that a null check is
1838 /// not necessary.
1839 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1840                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1841   MachineInstr *Def = MRI.getVRegDef(Val);
1842   switch (Def->getOpcode()) {
1843   case AMDGPU::G_FRAME_INDEX:
1844   case AMDGPU::G_GLOBAL_VALUE:
1845   case AMDGPU::G_BLOCK_ADDR:
1846     return true;
1847   case AMDGPU::G_CONSTANT: {
1848     const ConstantInt *CI = Def->getOperand(1).getCImm();
1849     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1850   }
1851   default:
1852     return false;
1853   }
1854 
1855   return false;
1856 }
1857 
1858 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1859   MachineInstr &MI, MachineRegisterInfo &MRI,
1860   MachineIRBuilder &B) const {
1861   MachineFunction &MF = B.getMF();
1862 
1863   const LLT S32 = LLT::scalar(32);
1864   Register Dst = MI.getOperand(0).getReg();
1865   Register Src = MI.getOperand(1).getReg();
1866 
1867   LLT DstTy = MRI.getType(Dst);
1868   LLT SrcTy = MRI.getType(Src);
1869   unsigned DestAS = DstTy.getAddressSpace();
1870   unsigned SrcAS = SrcTy.getAddressSpace();
1871 
1872   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1873   // vector element.
1874   assert(!DstTy.isVector());
1875 
1876   const AMDGPUTargetMachine &TM
1877     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1878 
1879   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1880     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1881     return true;
1882   }
1883 
1884   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1885       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1886        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1887     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1888       // Extract low 32-bits of the pointer.
1889       B.buildExtract(Dst, Src, 0);
1890       MI.eraseFromParent();
1891       return true;
1892     }
1893 
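    // A null flat pointer (0) must map to the segment's null value, which for
    // the local and private address spaces is -1, so compare the source
    // against flat null and select the segment null in that case.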
1894     unsigned NullVal = TM.getNullPointerValue(DestAS);
1895 
1896     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1897     auto FlatNull = B.buildConstant(SrcTy, 0);
1898 
1899     // Extract low 32-bits of the pointer.
1900     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1901 
1902     auto CmpRes =
1903         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1904     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1905 
1906     MI.eraseFromParent();
1907     return true;
1908   }
1909 
1910   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1911       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1912        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1913     if (!ST.hasFlatAddressSpace())
1914       return false;
1915 
1916     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1917     if (!ApertureReg.isValid())
1918       return false;
1919 
1920     // Coerce the type of the low half of the result so we can use merge_values.
1921     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1922 
1923     // TODO: Should we allow mismatched types but matching sizes in merges to
1924     // avoid the ptrtoint?
1925     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1926 
1927     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1928       B.buildCopy(Dst, BuildPtr);
1929       MI.eraseFromParent();
1930       return true;
1931     }
1932 
1933     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1934     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1935 
1936     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1937                               SegmentNull.getReg(0));
1938 
1939     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1940 
1941     MI.eraseFromParent();
1942     return true;
1943   }
1944 
1945   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1946       SrcTy.getSizeInBits() == 64) {
1947     // Truncate.
1948     B.buildExtract(Dst, Src, 0);
1949     MI.eraseFromParent();
1950     return true;
1951   }
1952 
1953   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1954       DstTy.getSizeInBits() == 64) {
1955     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1956     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1957 
1958     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1959     // another. Merge operands are required to be the same type, but creating an
1960     // extra ptrtoint would be kind of pointless.
1961     auto HighAddr = B.buildConstant(
1962         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1963     B.buildMerge(Dst, {Src, HighAddr});
1964     MI.eraseFromParent();
1965     return true;
1966   }
1967 
1968   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1969       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
1970 
1971   LLVMContext &Ctx = MF.getFunction().getContext();
1972   Ctx.diagnose(InvalidAddrSpaceCast);
1973   B.buildUndef(Dst);
1974   MI.eraseFromParent();
1975   return true;
1976 }
1977 
1978 bool AMDGPULegalizerInfo::legalizeFrint(
1979   MachineInstr &MI, MachineRegisterInfo &MRI,
1980   MachineIRBuilder &B) const {
1981   Register Src = MI.getOperand(1).getReg();
1982   LLT Ty = MRI.getType(Src);
1983   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1984 
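  // Adding 2^52 with the sign of the source forces the fraction to be rounded
  // away, since a double's significand holds exactly 52 fractional bits;
  // subtracting it back leaves the rounded value. Inputs whose magnitude is
  // already at least 2^52 are integral and are passed through by the select
  // below.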
1985   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1986   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1987 
1988   auto C1 = B.buildFConstant(Ty, C1Val);
1989   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1990 
1991   // TODO: Should this propagate fast-math-flags?
1992   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1993   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1994 
1995   auto C2 = B.buildFConstant(Ty, C2Val);
1996   auto Fabs = B.buildFAbs(Ty, Src);
1997 
1998   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1999   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2000   MI.eraseFromParent();
2001   return true;
2002 }
2003 
2004 bool AMDGPULegalizerInfo::legalizeFceil(
2005   MachineInstr &MI, MachineRegisterInfo &MRI,
2006   MachineIRBuilder &B) const {
2007 
2008   const LLT S1 = LLT::scalar(1);
2009   const LLT S64 = LLT::scalar(64);
2010 
2011   Register Src = MI.getOperand(1).getReg();
2012   assert(MRI.getType(Src) == S64);
2013 
2014   // result = trunc(src)
2015   // if (src > 0.0 && src != result)
2016   //   result += 1.0
2017 
2018   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2019 
2020   const auto Zero = B.buildFConstant(S64, 0.0);
2021   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
2025   auto Add = B.buildSelect(S64, And, One, Zero);
2026 
2027   // TODO: Should this propagate fast-math-flags?
2028   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2029   MI.eraseFromParent();
2030   return true;
2031 }
2032 
2033 bool AMDGPULegalizerInfo::legalizeFrem(
2034   MachineInstr &MI, MachineRegisterInfo &MRI,
2035   MachineIRBuilder &B) const {
2036     Register DstReg = MI.getOperand(0).getReg();
2037     Register Src0Reg = MI.getOperand(1).getReg();
2038     Register Src1Reg = MI.getOperand(2).getReg();
2039     auto Flags = MI.getFlags();
2040     LLT Ty = MRI.getType(DstReg);
2041 
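    // frem(x, y) == x - trunc(x / y) * y, computed as fma(-trunc(x / y), y, x).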
2042     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2043     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2044     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2045     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2046     MI.eraseFromParent();
2047     return true;
2048 }
2049 
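// An IEEE-754 double is laid out as [sign:1][exponent:11][fraction:52], so the
// biased exponent occupies bits [20..30] of the high 32-bit word. Extract it
// with a bitfield extract and subtract the bias of 1023.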
2050 static MachineInstrBuilder extractF64Exponent(Register Hi,
2051                                               MachineIRBuilder &B) {
2052   const unsigned FractBits = 52;
2053   const unsigned ExpBits = 11;
2054   LLT S32 = LLT::scalar(32);
2055 
2056   auto Const0 = B.buildConstant(S32, FractBits - 32);
2057   auto Const1 = B.buildConstant(S32, ExpBits);
2058 
2059   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2060     .addUse(Hi)
2061     .addUse(Const0.getReg(0))
2062     .addUse(Const1.getReg(0));
2063 
2064   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2065 }
2066 
2067 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2068   MachineInstr &MI, MachineRegisterInfo &MRI,
2069   MachineIRBuilder &B) const {
2070   const LLT S1 = LLT::scalar(1);
2071   const LLT S32 = LLT::scalar(32);
2072   const LLT S64 = LLT::scalar(64);
2073 
2074   Register Src = MI.getOperand(1).getReg();
2075   assert(MRI.getType(Src) == S64);
2076 
2077   // TODO: Should this use extract since the low half is unused?
2078   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2079   Register Hi = Unmerge.getReg(1);
2080 
2081   // Extract the upper half, since this is where we will find the sign and
2082   // exponent.
2083   auto Exp = extractF64Exponent(Hi, B);
2084 
2085   const unsigned FractBits = 52;
2086 
2087   // Extract the sign bit.
2088   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2089   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2090 
2091   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2092 
2093   const auto Zero32 = B.buildConstant(S32, 0);
2094 
2095   // Extend back to 64-bits.
2096   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2097 
2098   auto Shr = B.buildAShr(S64, FractMask, Exp);
2099   auto Not = B.buildNot(S64, Shr);
2100   auto Tmp0 = B.buildAnd(S64, Src, Not);
2101   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2102 
2103   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2104   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2105 
2106   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2107   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2108   MI.eraseFromParent();
2109   return true;
2110 }
2111 
2112 bool AMDGPULegalizerInfo::legalizeITOFP(
2113   MachineInstr &MI, MachineRegisterInfo &MRI,
2114   MachineIRBuilder &B, bool Signed) const {
2115 
2116   Register Dst = MI.getOperand(0).getReg();
2117   Register Src = MI.getOperand(1).getReg();
2118 
2119   const LLT S64 = LLT::scalar(64);
2120   const LLT S32 = LLT::scalar(32);
2121 
2122   assert(MRI.getType(Src) == S64);
2123 
2124   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2125   auto ThirtyTwo = B.buildConstant(S32, 32);
2126 
2127   if (MRI.getType(Dst) == S64) {
2128     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2129                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2130 
2131     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2132     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2133                      .addUse(CvtHi.getReg(0))
2134                      .addUse(ThirtyTwo.getReg(0));
2135 
2136     // TODO: Should this propagate fast-math-flags?
2137     B.buildFAdd(Dst, LdExp, CvtLo);
2138     MI.eraseFromParent();
2139     return true;
2140   }
2141 
2142   assert(MRI.getType(Dst) == S32);
2143 
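  // For the 32-bit result, normalize the 64-bit source by shifting out the
  // redundant leading sign/zero bits, convert just the high 32 bits (with a
  // sticky bit from the low half ORed into bit 0 to keep rounding correct),
  // and rescale the result with ldexp by 32 - shift.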
2144   auto One = B.buildConstant(S32, 1);
2145 
2146   MachineInstrBuilder ShAmt;
2147   if (Signed) {
2148     auto ThirtyOne = B.buildConstant(S32, 31);
2149     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2150     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2151     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2152     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2153                                /*HasSideEffects=*/false)
2154                   .addUse(Unmerge.getReg(1));
2155     auto LS2 = B.buildSub(S32, LS, One);
2156     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2157   } else
2158     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2159   auto Norm = B.buildShl(S64, Src, ShAmt);
2160   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2161   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2162   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2163   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2164   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2165   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2166                    /*HasSideEffects=*/false)
2167       .addUse(FVal.getReg(0))
2168       .addUse(Scale.getReg(0));
2169   MI.eraseFromParent();
2170   return true;
2171 }
2172 
2173 // TODO: Copied from DAG implementation. Verify logic and document how this
2174 // actually works.
2175 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2176                                         MachineRegisterInfo &MRI,
2177                                         MachineIRBuilder &B,
2178                                         bool Signed) const {
2179 
2180   Register Dst = MI.getOperand(0).getReg();
2181   Register Src = MI.getOperand(1).getReg();
2182 
2183   const LLT S64 = LLT::scalar(64);
2184   const LLT S32 = LLT::scalar(32);
2185 
2186   const LLT SrcLT = MRI.getType(Src);
2187   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2188 
2189   unsigned Flags = MI.getFlags();
2190 
2191   // The basic idea of converting a floating point number into a pair of 32-bit
2192   // integers is illustrated as follows:
2193   //
2194   //     tf := trunc(val);
2195   //    hif := floor(tf * 2^-32);
2196   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2197   //     hi := fptoi(hif);
2198   //     lo := fptoi(lof);
2199   //
2200   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2201   MachineInstrBuilder Sign;
2202   if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2208     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2209     Trunc = B.buildFAbs(S32, Trunc, Flags);
2210   }
2211   MachineInstrBuilder K0, K1;
2212   if (SrcLT == S64) {
2213     K0 = B.buildFConstant(S64,
2214                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2215     K1 = B.buildFConstant(S64,
2216                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2217   } else {
2218     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2219     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2220   }
2221 
2222   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2223   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2224   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2225 
2226   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2227                                      : B.buildFPTOUI(S32, FloorMul);
2228   auto Lo = B.buildFPTOUI(S32, Fma);
2229 
2230   if (Signed && SrcLT == S32) {
2231     // Flip the result based on the signedness, which is either all 0s or 1s.
2232     Sign = B.buildMerge(S64, {Sign, Sign});
2233     // r := xor({lo, hi}, sign) - sign;
2234     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2235   } else
2236     B.buildMerge(Dst, {Lo, Hi});
2237   MI.eraseFromParent();
2238 
2239   return true;
2240 }
2241 
2242 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2243                                                MachineInstr &MI) const {
2244   MachineFunction &MF = Helper.MIRBuilder.getMF();
2245   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2246 
2247   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2248                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2249 
  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
2252   if (!MFI->getMode().IEEE)
2253     return !IsIEEEOp;
2254 
2255   if (IsIEEEOp)
2256     return true;
2257 
2258   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2259 }
2260 
2261 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2262   MachineInstr &MI, MachineRegisterInfo &MRI,
2263   MachineIRBuilder &B) const {
2264   // TODO: Should move some of this into LegalizerHelper.
2265 
2266   // TODO: Promote dynamic indexing of s16 to s32
2267 
2268   // FIXME: Artifact combiner probably should have replaced the truncated
2269   // constant before this, so we shouldn't need
2270   // getIConstantVRegValWithLookThrough.
2271   Optional<ValueAndVReg> MaybeIdxVal =
2272       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2273   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2274     return true;
2275   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2276 
2277   Register Dst = MI.getOperand(0).getReg();
2278   Register Vec = MI.getOperand(1).getReg();
2279 
2280   LLT VecTy = MRI.getType(Vec);
2281   LLT EltTy = VecTy.getElementType();
2282   assert(EltTy == MRI.getType(Dst));
2283 
2284   if (IdxVal < VecTy.getNumElements()) {
2285     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2286     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2287   } else {
2288     B.buildUndef(Dst);
2289   }
2290 
2291   MI.eraseFromParent();
2292   return true;
2293 }
2294 
2295 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2296   MachineInstr &MI, MachineRegisterInfo &MRI,
2297   MachineIRBuilder &B) const {
2298   // TODO: Should move some of this into LegalizerHelper.
2299 
2300   // TODO: Promote dynamic indexing of s16 to s32
2301 
2302   // FIXME: Artifact combiner probably should have replaced the truncated
2303   // constant before this, so we shouldn't need
2304   // getIConstantVRegValWithLookThrough.
2305   Optional<ValueAndVReg> MaybeIdxVal =
2306       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2307   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2308     return true;
2309 
2310   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2311   Register Dst = MI.getOperand(0).getReg();
2312   Register Vec = MI.getOperand(1).getReg();
2313   Register Ins = MI.getOperand(2).getReg();
2314 
2315   LLT VecTy = MRI.getType(Vec);
2316   LLT EltTy = VecTy.getElementType();
2317   assert(EltTy == MRI.getType(Ins));
2318   (void)Ins;
2319 
2320   unsigned NumElts = VecTy.getNumElements();
2321   if (IdxVal < NumElts) {
2322     SmallVector<Register, 8> SrcRegs;
2323     for (unsigned i = 0; i < NumElts; ++i)
2324       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2325     B.buildUnmerge(SrcRegs, Vec);
2326 
2327     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2328     B.buildMerge(Dst, SrcRegs);
2329   } else {
2330     B.buildUndef(Dst);
2331   }
2332 
2333   MI.eraseFromParent();
2334   return true;
2335 }
2336 
2337 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2338   MachineInstr &MI, MachineRegisterInfo &MRI,
2339   MachineIRBuilder &B) const {
2340   const LLT V2S16 = LLT::fixed_vector(2, 16);
2341 
2342   Register Dst = MI.getOperand(0).getReg();
2343   Register Src0 = MI.getOperand(1).getReg();
2344   LLT DstTy = MRI.getType(Dst);
2345   LLT SrcTy = MRI.getType(Src0);
2346 
2347   if (SrcTy == V2S16 && DstTy == V2S16 &&
2348       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2349     return true;
2350 
2351   MachineIRBuilder HelperBuilder(MI);
2352   GISelObserverWrapper DummyObserver;
2353   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2354   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2355 }
2356 
2357 bool AMDGPULegalizerInfo::legalizeSinCos(
2358   MachineInstr &MI, MachineRegisterInfo &MRI,
2359   MachineIRBuilder &B) const {
2360 
2361   Register DstReg = MI.getOperand(0).getReg();
2362   Register SrcReg = MI.getOperand(1).getReg();
2363   LLT Ty = MRI.getType(DstReg);
2364   unsigned Flags = MI.getFlags();
2365 
2366   Register TrigVal;
2367   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2368   if (ST.hasTrigReducedRange()) {
2369     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2370     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2371       .addUse(MulVal.getReg(0))
2372       .setMIFlags(Flags).getReg(0);
2373   } else
2374     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2375 
2376   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2377     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2378   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2379     .addUse(TrigVal)
2380     .setMIFlags(Flags);
2381   MI.eraseFromParent();
2382   return true;
2383 }
2384 
2385 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2386                                                   MachineIRBuilder &B,
2387                                                   const GlobalValue *GV,
2388                                                   int64_t Offset,
2389                                                   unsigned GAFlags) const {
2390   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2391   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2392   // to the following code sequence:
2393   //
2394   // For constant address space:
2395   //   s_getpc_b64 s[0:1]
2396   //   s_add_u32 s0, s0, $symbol
2397   //   s_addc_u32 s1, s1, 0
2398   //
2399   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2400   //   a fixup or relocation is emitted to replace $symbol with a literal
2401   //   constant, which is a pc-relative offset from the encoding of the $symbol
2402   //   operand to the global variable.
2403   //
2404   // For global address space:
2405   //   s_getpc_b64 s[0:1]
2406   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2407   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2408   //
2409   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2410   //   fixups or relocations are emitted to replace $symbol@*@lo and
2411   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2412   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2413   //   operand to the global variable.
2414   //
2415   // What we want here is an offset from the value returned by s_getpc
2416   // (which is the address of the s_add_u32 instruction) to the global
2417   // variable, but since the encoding of $symbol starts 4 bytes after the start
2418   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2419   // small. This requires us to add 4 to the global variable offset in order to
2420   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2421   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2422   // instruction.
2423 
2424   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2425 
2426   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2427     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2428 
2429   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2430     .addDef(PCReg);
2431 
2432   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2433   if (GAFlags == SIInstrInfo::MO_NONE)
2434     MIB.addImm(0);
2435   else
2436     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2437 
2438   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2439 
2440   if (PtrTy.getSizeInBits() == 32)
2441     B.buildExtract(DstReg, PCReg, 0);
2442   return true;
2443  }
2444 
2445 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2446   MachineInstr &MI, MachineRegisterInfo &MRI,
2447   MachineIRBuilder &B) const {
2448   Register DstReg = MI.getOperand(0).getReg();
2449   LLT Ty = MRI.getType(DstReg);
2450   unsigned AS = Ty.getAddressSpace();
2451 
2452   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2453   MachineFunction &MF = B.getMF();
2454   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2455 
2456   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2457     if (!MFI->isModuleEntryFunction() &&
2458         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2459       const Function &Fn = MF.getFunction();
2460       DiagnosticInfoUnsupported BadLDSDecl(
2461         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2462         DS_Warning);
2463       Fn.getContext().diagnose(BadLDSDecl);
2464 
2465       // We currently don't have a way to correctly allocate LDS objects that
2466       // aren't directly associated with a kernel. We do force inlining of
2467       // functions that use local objects. However, if these dead functions are
2468       // not eliminated, we don't want a compile time error. Just emit a warning
2469       // and a trap, since there should be no callable path here.
2470       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2471       B.buildUndef(DstReg);
2472       MI.eraseFromParent();
2473       return true;
2474     }
2475 
2476     // TODO: We could emit code to handle the initialization somewhere.
2477     // We ignore the initializer for now and legalize it to allow selection.
2478     // The initializer will anyway get errored out during assembly emission.
2479     const SITargetLowering *TLI = ST.getTargetLowering();
2480     if (!TLI->shouldUseLDSConstAddress(GV)) {
2481       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
2483     }
2484 
2485     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2486       Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. Such arrays are allocated by
      // the runtime and placed directly after the statically allocated ones.
      // They all share the same offset.
2492       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2493         // Adjust alignment for that dynamic shared memory array.
2494         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2495         LLT S32 = LLT::scalar(32);
2496         auto Sz =
2497             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2498         B.buildIntToPtr(DstReg, Sz);
2499         MI.eraseFromParent();
2500         return true;
2501       }
2502     }
2503 
2504     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2505                                                    *cast<GlobalVariable>(GV)));
2506     MI.eraseFromParent();
2507     return true;
2508   }
2509 
2510   const SITargetLowering *TLI = ST.getTargetLowering();
2511 
2512   if (TLI->shouldEmitFixup(GV)) {
2513     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2514     MI.eraseFromParent();
2515     return true;
2516   }
2517 
2518   if (TLI->shouldEmitPCReloc(GV)) {
2519     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2520     MI.eraseFromParent();
2521     return true;
2522   }
2523 
2524   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2525   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2526 
2527   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2528   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2529       MachinePointerInfo::getGOT(MF),
2530       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2531           MachineMemOperand::MOInvariant,
2532       LoadTy, Align(8));
2533 
2534   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2535 
2536   if (Ty.getSizeInBits() == 32) {
2537     // Truncate if this is a 32-bit constant address.
2538     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2539     B.buildExtract(DstReg, Load, 0);
2540   } else
2541     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2542 
2543   MI.eraseFromParent();
2544   return true;
2545 }
2546 
2547 static LLT widenToNextPowerOf2(LLT Ty) {
2548   if (Ty.isVector())
2549     return Ty.changeElementCount(
2550         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2551   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2552 }
2553 
2554 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2555                                        MachineInstr &MI) const {
2556   MachineIRBuilder &B = Helper.MIRBuilder;
2557   MachineRegisterInfo &MRI = *B.getMRI();
2558   GISelChangeObserver &Observer = Helper.Observer;
2559 
2560   Register PtrReg = MI.getOperand(1).getReg();
2561   LLT PtrTy = MRI.getType(PtrReg);
2562   unsigned AddrSpace = PtrTy.getAddressSpace();
2563 
2564   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2565     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2566     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2567     Observer.changingInstr(MI);
2568     MI.getOperand(1).setReg(Cast.getReg(0));
2569     Observer.changedInstr(MI);
2570     return true;
2571   }
2572 
2573   if (MI.getOpcode() != AMDGPU::G_LOAD)
2574     return false;
2575 
2576   Register ValReg = MI.getOperand(0).getReg();
2577   LLT ValTy = MRI.getType(ValReg);
2578 
2579   MachineMemOperand *MMO = *MI.memoperands_begin();
2580   const unsigned ValSize = ValTy.getSizeInBits();
2581   const LLT MemTy = MMO->getMemoryType();
2582   const Align MemAlign = MMO->getAlign();
2583   const unsigned MemSize = MemTy.getSizeInBits();
2584   const uint64_t AlignInBits = 8 * MemAlign.value();
2585 
2586   // Widen non-power-of-2 loads to the alignment if needed
2587   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2588     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2589 
2590     // This was already the correct extending load result type, so just adjust
2591     // the memory type.
2592     if (WideMemSize == ValSize) {
2593       MachineFunction &MF = B.getMF();
2594 
2595       MachineMemOperand *WideMMO =
2596           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2597       Observer.changingInstr(MI);
2598       MI.setMemRefs(MF, {WideMMO});
2599       Observer.changedInstr(MI);
2600       return true;
2601     }
2602 
    // Don't bother handling an edge case that should probably never be produced.
2604     if (ValSize > WideMemSize)
2605       return false;
2606 
2607     LLT WideTy = widenToNextPowerOf2(ValTy);
2608 
2609     Register WideLoad;
2610     if (!WideTy.isVector()) {
2611       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2612       B.buildTrunc(ValReg, WideLoad).getReg(0);
2613     } else {
2614       // Extract the subvector.
2615 
2616       if (isRegisterType(ValTy)) {
2617         // If this a case where G_EXTRACT is legal, use it.
2618         // (e.g. <3 x s32> -> <4 x s32>)
2619         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2620         B.buildExtract(ValReg, WideLoad, 0);
2621       } else {
2622         // For cases where the widened type isn't a nice register value, unmerge
2623         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2624         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2625         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2626       }
2627     }
2628 
2629     MI.eraseFromParent();
2630     return true;
2631   }
2632 
2633   return false;
2634 }
2635 
2636 bool AMDGPULegalizerInfo::legalizeFMad(
2637   MachineInstr &MI, MachineRegisterInfo &MRI,
2638   MachineIRBuilder &B) const {
2639   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2640   assert(Ty.isScalar());
2641 
2642   MachineFunction &MF = B.getMF();
2643   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2644 
2645   // TODO: Always legal with future ftz flag.
2646   // FIXME: Do we need just output?
2647   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2648     return true;
2649   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2650     return true;
2651 
2652   MachineIRBuilder HelperBuilder(MI);
2653   GISelObserverWrapper DummyObserver;
2654   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2655   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2656 }
2657 
2658 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2659   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2660   Register DstReg = MI.getOperand(0).getReg();
2661   Register PtrReg = MI.getOperand(1).getReg();
2662   Register CmpVal = MI.getOperand(2).getReg();
2663   Register NewVal = MI.getOperand(3).getReg();
2664 
2665   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2666          "this should not have been custom lowered");
2667 
2668   LLT ValTy = MRI.getType(CmpVal);
2669   LLT VecTy = LLT::fixed_vector(2, ValTy);
2670 
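  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
  // into a single two-element vector operand, built here as { NewVal, CmpVal }.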
2671   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2672 
2673   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2674     .addDef(DstReg)
2675     .addUse(PtrReg)
2676     .addUse(PackedVal)
2677     .setMemRefs(MI.memoperands());
2678 
2679   MI.eraseFromParent();
2680   return true;
2681 }
2682 
2683 bool AMDGPULegalizerInfo::legalizeFlog(
2684   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2685   Register Dst = MI.getOperand(0).getReg();
2686   Register Src = MI.getOperand(1).getReg();
2687   LLT Ty = B.getMRI()->getType(Dst);
2688   unsigned Flags = MI.getFlags();
2689 
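  // log_b(x) == log2(x) * log_b(2); the caller passes log_b(2) as
  // Log2BaseInverted (ln(2) for G_FLOG, log10(2) for G_FLOG10).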
2690   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2691   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2692 
2693   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2694   MI.eraseFromParent();
2695   return true;
2696 }
2697 
2698 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2699                                        MachineIRBuilder &B) const {
2700   Register Dst = MI.getOperand(0).getReg();
2701   Register Src = MI.getOperand(1).getReg();
2702   unsigned Flags = MI.getFlags();
2703   LLT Ty = B.getMRI()->getType(Dst);
2704 
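  // exp(x) == exp2(x * log2(e)).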
2705   auto K = B.buildFConstant(Ty, numbers::log2e);
2706   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2707   B.buildFExp2(Dst, Mul, Flags);
2708   MI.eraseFromParent();
2709   return true;
2710 }
2711 
2712 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2713                                        MachineIRBuilder &B) const {
2714   Register Dst = MI.getOperand(0).getReg();
2715   Register Src0 = MI.getOperand(1).getReg();
2716   Register Src1 = MI.getOperand(2).getReg();
2717   unsigned Flags = MI.getFlags();
2718   LLT Ty = B.getMRI()->getType(Dst);
2719   const LLT S16 = LLT::scalar(16);
2720   const LLT S32 = LLT::scalar(32);
2721 
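  // pow(x, y) is expanded as exp2(log2(x) * y).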
2722   if (Ty == S32) {
2723     auto Log = B.buildFLog2(S32, Src0, Flags);
2724     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2725       .addUse(Log.getReg(0))
2726       .addUse(Src1)
2727       .setMIFlags(Flags);
2728     B.buildFExp2(Dst, Mul, Flags);
2729   } else if (Ty == S16) {
2730     // There's no f16 fmul_legacy, so we need to convert for it.
2731     auto Log = B.buildFLog2(S16, Src0, Flags);
2732     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2733     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2734     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2735       .addUse(Ext0.getReg(0))
2736       .addUse(Ext1.getReg(0))
2737       .setMIFlags(Flags);
2738 
2739     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2740   } else
2741     return false;
2742 
2743   MI.eraseFromParent();
2744   return true;
2745 }
2746 
2747 // Find a source register, ignoring any possible source modifiers.
2748 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2749   Register ModSrc = OrigSrc;
2750   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2751     ModSrc = SrcFNeg->getOperand(1).getReg();
2752     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2753       ModSrc = SrcFAbs->getOperand(1).getReg();
2754   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2755     ModSrc = SrcFAbs->getOperand(1).getReg();
2756   return ModSrc;
2757 }
2758 
2759 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2760                                          MachineRegisterInfo &MRI,
2761                                          MachineIRBuilder &B) const {
2762 
2763   const LLT S1 = LLT::scalar(1);
2764   const LLT S64 = LLT::scalar(64);
2765   Register Dst = MI.getOperand(0).getReg();
2766   Register OrigSrc = MI.getOperand(1).getReg();
2767   unsigned Flags = MI.getFlags();
2768   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2769          "this should not have been custom lowered");
2770 
2771   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2772   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2773   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2774   // V_FRACT bug is:
2775   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2776   //
2777   // Convert floor(x) to (x - fract(x))
2778 
2779   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2780     .addUse(OrigSrc)
2781     .setMIFlags(Flags);
2782 
2783   // Give source modifier matching some assistance before obscuring a foldable
2784   // pattern.
2785 
2786   // TODO: We can avoid the neg on the fract? The input sign to fract
2787   // shouldn't matter?
2788   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2789 
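  // 0x3fefffffffffffff is the largest double strictly less than 1.0, i.e. the
  // 0.99999999999999999 clamp from the workaround described above.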
2790   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2791 
2792   Register Min = MRI.createGenericVirtualRegister(S64);
2793 
2794   // We don't need to concern ourselves with the snan handling difference, so
2795   // use the one which will directly select.
2796   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2797   if (MFI->getMode().IEEE)
2798     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2799   else
2800     B.buildFMinNum(Min, Fract, Const, Flags);
2801 
2802   Register CorrectedFract = Min;
2803   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2804     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2805     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2806   }
2807 
2808   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2809   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2810 
2811   MI.eraseFromParent();
2812   return true;
2813 }
2814 
2815 // Turn an illegal packed v2s16 build vector into bit operations.
2816 // TODO: This should probably be a bitcast action in LegalizerHelper.
2817 bool AMDGPULegalizerInfo::legalizeBuildVector(
2818   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2819   Register Dst = MI.getOperand(0).getReg();
2820   const LLT S32 = LLT::scalar(32);
2821   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2822 
2823   Register Src0 = MI.getOperand(1).getReg();
2824   Register Src1 = MI.getOperand(2).getReg();
2825   assert(MRI.getType(Src0) == LLT::scalar(16));
2826 
2827   auto Merge = B.buildMerge(S32, {Src0, Src1});
2828   B.buildBitcast(Dst, Merge);
2829 
2830   MI.eraseFromParent();
2831   return true;
2832 }
2833 
2834 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2835 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2836 // case with a single min instruction instead of a compare+select.
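// ffbh/ffbl return -1 for a zero input, and umin(-1, bit width) folds that
// back to the bit width, which is the value ctlz/cttz expect for zero.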
2837 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2838                                             MachineRegisterInfo &MRI,
2839                                             MachineIRBuilder &B) const {
2840   Register Dst = MI.getOperand(0).getReg();
2841   Register Src = MI.getOperand(1).getReg();
2842   LLT DstTy = MRI.getType(Dst);
2843   LLT SrcTy = MRI.getType(Src);
2844 
2845   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2846                         ? AMDGPU::G_AMDGPU_FFBH_U32
2847                         : AMDGPU::G_AMDGPU_FFBL_B32;
2848   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2849   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2850 
2851   MI.eraseFromParent();
2852   return true;
2853 }
2854 
2855 // Check that this is a G_XOR x, -1
2856 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2857   if (MI.getOpcode() != TargetOpcode::G_XOR)
2858     return false;
2859   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2860   return ConstVal && *ConstVal == -1;
2861 }
2862 
// Return the branch instruction which uses the condition, or null if the usage
// is invalid.
2864 static MachineInstr *
2865 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2866                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2867   Register CondDef = MI.getOperand(0).getReg();
2868   if (!MRI.hasOneNonDBGUse(CondDef))
2869     return nullptr;
2870 
2871   MachineBasicBlock *Parent = MI.getParent();
2872   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2873 
2874   if (isNot(MRI, *UseMI)) {
2875     Register NegatedCond = UseMI->getOperand(0).getReg();
2876     if (!MRI.hasOneNonDBGUse(NegatedCond))
2877       return nullptr;
2878 
2879     // We're deleting the def of this value, so we need to remove it.
2880     eraseInstr(*UseMI, MRI);
2881 
2882     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2883     Negated = true;
2884   }
2885 
2886   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2887     return nullptr;
2888 
2889   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2890   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2891   if (Next == Parent->end()) {
2892     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2893     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2894       return nullptr;
2895     UncondBrTarget = &*NextMBB;
2896   } else {
2897     if (Next->getOpcode() != AMDGPU::G_BR)
2898       return nullptr;
2899     Br = &*Next;
2900     UncondBrTarget = Br->getOperand(0).getMBB();
2901   }
2902 
2903   return UseMI;
2904 }
2905 
2906 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2907                                          const ArgDescriptor *Arg,
2908                                          const TargetRegisterClass *ArgRC,
2909                                          LLT ArgTy) const {
2910   MCRegister SrcReg = Arg->getRegister();
2911   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2912   assert(DstReg.isVirtual() && "Virtual register expected");
2913 
2914   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2915                                              *ArgRC, B.getDebugLoc(), ArgTy);
2916   if (Arg->isMasked()) {
2917     // TODO: Should we try to emit this once in the entry block?
2918     const LLT S32 = LLT::scalar(32);
2919     const unsigned Mask = Arg->getMask();
2920     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2921 
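    // For example (sketch): packed workitem IDs typically occupy 10-bit
    // fields of one register; a descriptor with Mask = 0x3ff << 10 yields
    // Shift = 10, so we emit an lshr by 10 followed by an AND with 0x3ff.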
2922     Register AndMaskSrc = LiveIn;
2923 
2924     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2925     // 0.
2926     if (Shift != 0) {
2927       auto ShiftAmt = B.buildConstant(S32, Shift);
2928       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2929     }
2930 
2931     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2932   } else {
2933     B.buildCopy(DstReg, LiveIn);
2934   }
2935 
2936   return true;
2937 }
2938 
2939 bool AMDGPULegalizerInfo::loadInputValue(
2940     Register DstReg, MachineIRBuilder &B,
2941     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2942   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2943   const ArgDescriptor *Arg;
2944   const TargetRegisterClass *ArgRC;
2945   LLT ArgTy;
2946   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2947 
2948   if (!Arg) {
2949     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in
      // which case the pointer argument may be missing and we use null.
2952       B.buildConstant(DstReg, 0);
2953       return true;
2954     }
2955 
2956     // It's undefined behavior if a function marked with the amdgpu-no-*
2957     // attributes uses the corresponding intrinsic.
2958     B.buildUndef(DstReg);
2959     return true;
2960   }
2961 
2962   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2963     return false; // TODO: Handle these
2964   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2965 }
2966 
2967 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2968     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2969     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2970   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2971     return false;
2972 
2973   MI.eraseFromParent();
2974   return true;
2975 }
2976 
2977 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2978                                        MachineRegisterInfo &MRI,
2979                                        MachineIRBuilder &B) const {
2980   Register Dst = MI.getOperand(0).getReg();
2981   LLT DstTy = MRI.getType(Dst);
2982   LLT S16 = LLT::scalar(16);
2983   LLT S32 = LLT::scalar(32);
2984   LLT S64 = LLT::scalar(64);
2985 
2986   if (DstTy == S16)
2987     return legalizeFDIV16(MI, MRI, B);
2988   if (DstTy == S32)
2989     return legalizeFDIV32(MI, MRI, B);
2990   if (DstTy == S64)
2991     return legalizeFDIV64(MI, MRI, B);
2992 
2993   return false;
2994 }
2995 
2996 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2997                                                         Register DstDivReg,
2998                                                         Register DstRemReg,
2999                                                         Register X,
3000                                                         Register Y) const {
3001   const LLT S1 = LLT::scalar(1);
3002   const LLT S32 = LLT::scalar(32);
3003 
3004   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3005   // algorithm used here.
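  //
  // Sketch of the idea: compute z ~= 2^32 / y using the hardware float
  // reciprocal, refine z with one Newton-Raphson step, take q = umulh(x, z)
  // and r = x - q * y, then conditionally correct q and r (at most twice)
  // while r >= y.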
3006 
3007   // Initial estimate of inv(y).
3008   auto FloatY = B.buildUITOFP(S32, Y);
3009   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3010   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3011   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3012   auto Z = B.buildFPTOUI(S32, ScaledY);
3013 
  // One round of UNR (unsigned Newton-Raphson) refinement.
3015   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3016   auto NegYZ = B.buildMul(S32, NegY, Z);
3017   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3018 
3019   // Quotient/remainder estimate.
3020   auto Q = B.buildUMulH(S32, X, Z);
3021   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3022 
3023   // First quotient/remainder refinement.
3024   auto One = B.buildConstant(S32, 1);
3025   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3026   if (DstDivReg)
3027     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3028   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3029 
3030   // Second quotient/remainder refinement.
3031   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3032   if (DstDivReg)
3033     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3034 
3035   if (DstRemReg)
3036     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3037 }
3038 
3039 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3040 //
3041 // Return lo, hi of result
3042 //
3043 // %cvt.lo = G_UITOFP Val.lo
3044 // %cvt.hi = G_UITOFP Val.hi
3045 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3046 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3047 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3048 // %mul2 = G_FMUL %mul1, 2**(-32)
3049 // %trunc = G_INTRINSIC_TRUNC %mul2
3050 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3051 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
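//
// The magic constants, interpreted as single-precision values, are
// approximately: 0x4f800000 = 2^32, 0x5f7ffffc ~= 2^64 (slightly below),
// 0x2f800000 = 2^-32, and 0xcf800000 = -(2^32).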
3052 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3053                                                        Register Val) {
3054   const LLT S32 = LLT::scalar(32);
3055   auto Unmerge = B.buildUnmerge(S32, Val);
3056 
3057   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3058   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3059 
3060   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3061                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3062 
3063   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3064   auto Mul1 =
3065       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3066 
3067   // 2**(-32)
3068   auto Mul2 =
3069       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3070   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3071 
3072   // -(2**32)
3073   auto Mad2 = B.buildFMAD(S32, Trunc,
3074                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3075 
3076   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3077   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3078 
3079   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3080 }
3081 
3082 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3083                                                         Register DstDivReg,
3084                                                         Register DstRemReg,
3085                                                         Register Numer,
3086                                                         Register Denom) const {
3087   const LLT S32 = LLT::scalar(32);
3088   const LLT S64 = LLT::scalar(64);
3089   const LLT S1 = LLT::scalar(1);
3090   Register RcpLo, RcpHi;
3091 
3092   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3093 
3094   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3095 
3096   auto Zero64 = B.buildConstant(S64, 0);
3097   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3098 
3099   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3100   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3101 
3102   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3103   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3104   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3105 
3106   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3107   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3108   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3109 
3110   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3111   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3112   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3113   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3114   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3115 
3116   auto Zero32 = B.buildConstant(S32, 0);
3117   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3118   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3119   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3120 
3121   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3122   Register NumerLo = UnmergeNumer.getReg(0);
3123   Register NumerHi = UnmergeNumer.getReg(1);
3124 
3125   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3126   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3127   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3128   Register Mul3_Lo = UnmergeMul3.getReg(0);
3129   Register Mul3_Hi = UnmergeMul3.getReg(1);
3130   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3131   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3132   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3133   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3134 
3135   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3136   Register DenomLo = UnmergeDenom.getReg(0);
3137   Register DenomHi = UnmergeDenom.getReg(1);
3138 
3139   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3140   auto C1 = B.buildSExt(S32, CmpHi);
3141 
3142   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3143   auto C2 = B.buildSExt(S32, CmpLo);
3144 
3145   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3146   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3147 
  // TODO: The code here and below could be enclosed in if/endif blocks.
  // Currently the control flow is unconditional and we have 4 selects after
  // the potential endif to substitute for PHIs.
3151 
3152   // if C3 != 0 ...
3153   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3154   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3155   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3156   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3157 
3158   auto One64 = B.buildConstant(S64, 1);
3159   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3160 
3161   auto C4 =
3162       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3163   auto C5 =
3164       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3165   auto C6 = B.buildSelect(
3166       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3167 
3168   // if (C6 != 0)
3169   auto Add4 = B.buildAdd(S64, Add3, One64);
3170   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3171 
3172   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3173   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3174   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3175 
3176   // endif C6
3177   // endif C3
3178 
3179   if (DstDivReg) {
3180     auto Sel1 = B.buildSelect(
3181         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3182     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3183                   Sel1, MulHi3);
3184   }
3185 
3186   if (DstRemReg) {
3187     auto Sel2 = B.buildSelect(
3188         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3189     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3190                   Sel2, Sub1);
3191   }
3192 }
3193 
3194 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3195                                                   MachineRegisterInfo &MRI,
3196                                                   MachineIRBuilder &B) const {
3197   Register DstDivReg, DstRemReg;
3198   switch (MI.getOpcode()) {
3199   default:
3200     llvm_unreachable("Unexpected opcode!");
3201   case AMDGPU::G_UDIV: {
3202     DstDivReg = MI.getOperand(0).getReg();
3203     break;
3204   }
3205   case AMDGPU::G_UREM: {
3206     DstRemReg = MI.getOperand(0).getReg();
3207     break;
3208   }
3209   case AMDGPU::G_UDIVREM: {
3210     DstDivReg = MI.getOperand(0).getReg();
3211     DstRemReg = MI.getOperand(1).getReg();
3212     break;
3213   }
3214   }
3215 
3216   const LLT S64 = LLT::scalar(64);
3217   const LLT S32 = LLT::scalar(32);
3218   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3219   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3220   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3221   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3222 
3223   if (Ty == S32)
3224     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3225   else if (Ty == S64)
3226     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3227   else
3228     return false;
3229 
3230   MI.eraseFromParent();
3231   return true;
3232 }
3233 
3234 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3235                                                 MachineRegisterInfo &MRI,
3236                                                 MachineIRBuilder &B) const {
3237   const LLT S64 = LLT::scalar(64);
3238   const LLT S32 = LLT::scalar(32);
3239 
3240   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3241   if (Ty != S32 && Ty != S64)
3242     return false;
3243 
3244   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3245   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3246   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3247 
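  // Compute |LHS| and |RHS| as (x + sign) ^ sign, where sign = x >> (bits - 1)
  // with an arithmetic shift. The quotient/remainder signs are restored below
  // with the mirrored (tmp ^ sign) - sign pattern.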
3248   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3249   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3250   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3251 
3252   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3253   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3254 
3255   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3256   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3257 
3258   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3259   switch (MI.getOpcode()) {
3260   default:
3261     llvm_unreachable("Unexpected opcode!");
3262   case AMDGPU::G_SDIV: {
3263     DstDivReg = MI.getOperand(0).getReg();
3264     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3265     break;
3266   }
3267   case AMDGPU::G_SREM: {
3268     DstRemReg = MI.getOperand(0).getReg();
3269     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3270     break;
3271   }
3272   case AMDGPU::G_SDIVREM: {
3273     DstDivReg = MI.getOperand(0).getReg();
3274     DstRemReg = MI.getOperand(1).getReg();
3275     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3276     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3277     break;
3278   }
3279   }
3280 
3281   if (Ty == S32)
3282     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3283   else
3284     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3285 
3286   if (DstDivReg) {
3287     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3288     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3289     B.buildSub(DstDivReg, SignXor, Sign);
3290   }
3291 
3292   if (DstRemReg) {
3293     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3294     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3295     B.buildSub(DstRemReg, SignXor, Sign);
3296   }
3297 
3298   MI.eraseFromParent();
3299   return true;
3300 }
3301 
3302 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3303                                                  MachineRegisterInfo &MRI,
3304                                                  MachineIRBuilder &B) const {
3305   Register Res = MI.getOperand(0).getReg();
3306   Register LHS = MI.getOperand(1).getReg();
3307   Register RHS = MI.getOperand(2).getReg();
3308   uint16_t Flags = MI.getFlags();
3309   LLT ResTy = MRI.getType(Res);
3310 
3311   const MachineFunction &MF = B.getMF();
3312   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3313                             MI.getFlag(MachineInstr::FmAfn);
3314 
3315   if (!AllowInaccurateRcp)
3316     return false;
3317 
3318   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3319     // 1 / x -> RCP(x)
3320     if (CLHS->isExactlyValue(1.0)) {
3321       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3322         .addUse(RHS)
3323         .setMIFlags(Flags);
3324 
3325       MI.eraseFromParent();
3326       return true;
3327     }
3328 
3329     // -1 / x -> RCP( FNEG(x) )
3330     if (CLHS->isExactlyValue(-1.0)) {
3331       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3332       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3333         .addUse(FNeg.getReg(0))
3334         .setMIFlags(Flags);
3335 
3336       MI.eraseFromParent();
3337       return true;
3338     }
3339   }
3340 
3341   // x / y -> x * (1.0 / y)
3342   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3343     .addUse(RHS)
3344     .setMIFlags(Flags);
3345   B.buildFMul(Res, LHS, RCP, Flags);
3346 
3347   MI.eraseFromParent();
3348   return true;
3349 }
3350 
3351 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3352                                                    MachineRegisterInfo &MRI,
3353                                                    MachineIRBuilder &B) const {
3354   Register Res = MI.getOperand(0).getReg();
3355   Register X = MI.getOperand(1).getReg();
3356   Register Y = MI.getOperand(2).getReg();
3357   uint16_t Flags = MI.getFlags();
3358   LLT ResTy = MRI.getType(Res);
3359 
3360   const MachineFunction &MF = B.getMF();
3361   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3362                             MI.getFlag(MachineInstr::FmAfn);
3363 
3364   if (!AllowInaccurateRcp)
3365     return false;
3366 
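  // Refine r0 = rcp(y) with two Newton-Raphson steps of the form
  // r = fma(fma(-y, r, 1), r, r), then form x * r and apply one final
  // fma-based correction.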
3367   auto NegY = B.buildFNeg(ResTy, Y);
3368   auto One = B.buildFConstant(ResTy, 1.0);
3369 
3370   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3371     .addUse(Y)
3372     .setMIFlags(Flags);
3373 
3374   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3375   R = B.buildFMA(ResTy, Tmp0, R, R);
3376 
3377   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3378   R = B.buildFMA(ResTy, Tmp1, R, R);
3379 
3380   auto Ret = B.buildFMul(ResTy, X, R);
3381   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3382 
3383   B.buildFMA(Res, Tmp2, R, Ret);
3384   MI.eraseFromParent();
3385   return true;
3386 }
3387 
3388 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3389                                          MachineRegisterInfo &MRI,
3390                                          MachineIRBuilder &B) const {
3391   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3392     return true;
3393 
3394   Register Res = MI.getOperand(0).getReg();
3395   Register LHS = MI.getOperand(1).getReg();
3396   Register RHS = MI.getOperand(2).getReg();
3397 
3398   uint16_t Flags = MI.getFlags();
3399 
3400   LLT S16 = LLT::scalar(16);
3401   LLT S32 = LLT::scalar(32);
3402 
3403   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3404   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3405 
3406   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3407     .addUse(RHSExt.getReg(0))
3408     .setMIFlags(Flags);
3409 
3410   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3411   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3412 
3413   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3414     .addUse(RDst.getReg(0))
3415     .addUse(RHS)
3416     .addUse(LHS)
3417     .setMIFlags(Flags);
3418 
3419   MI.eraseFromParent();
3420   return true;
3421 }
3422 
3423 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3424 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3425 static void toggleSPDenormMode(bool Enable,
3426                                MachineIRBuilder &B,
3427                                const GCNSubtarget &ST,
3428                                AMDGPU::SIModeRegisterDefaults Mode) {
3429   // Set SP denorm mode to this value.
3430   unsigned SPDenormMode =
3431     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3432 
3433   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3435     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3436 
3437     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3438     B.buildInstr(AMDGPU::S_DENORM_MODE)
3439       .addImm(NewDenormModeValue);
3440 
3441   } else {
3442     // Select FP32 bit field in mode register.
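    // The field encodes hwreg(HW_REG_MODE, 4, 2): a 2-bit field at offset 4
    // (WIDTH_M1 = 1), which is where the FP32 denorm controls live.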
3443     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3444                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3445                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3446 
3447     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3448       .addImm(SPDenormMode)
3449       .addImm(SPDenormModeBitField);
3450   }
3451 }
3452 
3453 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3454                                          MachineRegisterInfo &MRI,
3455                                          MachineIRBuilder &B) const {
3456   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3457     return true;
3458 
3459   Register Res = MI.getOperand(0).getReg();
3460   Register LHS = MI.getOperand(1).getReg();
3461   Register RHS = MI.getOperand(2).getReg();
3462   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3463   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3464 
3465   uint16_t Flags = MI.getFlags();
3466 
3467   LLT S32 = LLT::scalar(32);
3468   LLT S1 = LLT::scalar(1);
3469 
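  // The expansion follows the hardware-assisted scheme: div_scale pre-scales
  // the operands, the scaled reciprocal is refined with a chain of FMAs,
  // div_fmas applies the final fma with the scale-correction predicate, and
  // div_fixup handles the special cases (infinities, NaNs, signed zeros).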
3470   auto One = B.buildFConstant(S32, 1.0f);
3471 
3472   auto DenominatorScaled =
3473     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3474       .addUse(LHS)
3475       .addUse(RHS)
3476       .addImm(0)
3477       .setMIFlags(Flags);
3478   auto NumeratorScaled =
3479     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3480       .addUse(LHS)
3481       .addUse(RHS)
3482       .addImm(1)
3483       .setMIFlags(Flags);
3484 
3485   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3486     .addUse(DenominatorScaled.getReg(0))
3487     .setMIFlags(Flags);
3488   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3489 
3490   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3491   // aren't modeled as reading it.
3492   if (!Mode.allFP32Denormals())
3493     toggleSPDenormMode(true, B, ST, Mode);
3494 
3495   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3496   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3497   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3498   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3499   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3500   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3501 
3502   if (!Mode.allFP32Denormals())
3503     toggleSPDenormMode(false, B, ST, Mode);
3504 
3505   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3506     .addUse(Fma4.getReg(0))
3507     .addUse(Fma1.getReg(0))
3508     .addUse(Fma3.getReg(0))
3509     .addUse(NumeratorScaled.getReg(1))
3510     .setMIFlags(Flags);
3511 
3512   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3513     .addUse(Fmas.getReg(0))
3514     .addUse(RHS)
3515     .addUse(LHS)
3516     .setMIFlags(Flags);
3517 
3518   MI.eraseFromParent();
3519   return true;
3520 }
3521 
3522 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3523                                          MachineRegisterInfo &MRI,
3524                                          MachineIRBuilder &B) const {
3525   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3526     return true;
3527 
3528   Register Res = MI.getOperand(0).getReg();
3529   Register LHS = MI.getOperand(1).getReg();
3530   Register RHS = MI.getOperand(2).getReg();
3531 
3532   uint16_t Flags = MI.getFlags();
3533 
3534   LLT S64 = LLT::scalar(64);
3535   LLT S1 = LLT::scalar(1);
3536 
3537   auto One = B.buildFConstant(S64, 1.0);
3538 
3539   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3540     .addUse(LHS)
3541     .addUse(RHS)
3542     .addImm(0)
3543     .setMIFlags(Flags);
3544 
3545   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3546 
3547   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3548     .addUse(DivScale0.getReg(0))
3549     .setMIFlags(Flags);
3550 
3551   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3552   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3553   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3554 
3555   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3556     .addUse(LHS)
3557     .addUse(RHS)
3558     .addImm(1)
3559     .setMIFlags(Flags);
3560 
3561   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3562   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3563   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3564 
3565   Register Scale;
3566   if (!ST.hasUsableDivScaleConditionOutput()) {
3567     // Workaround a hardware bug on SI where the condition output from div_scale
3568     // is not usable.
3569 
3570     LLT S32 = LLT::scalar(32);
3571 
3572     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3573     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3574     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3575     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3576 
3577     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3578                               Scale1Unmerge.getReg(1));
3579     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3580                               Scale0Unmerge.getReg(1));
3581     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3582   } else {
3583     Scale = DivScale1.getReg(1);
3584   }
3585 
3586   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3587     .addUse(Fma4.getReg(0))
3588     .addUse(Fma3.getReg(0))
3589     .addUse(Mul.getReg(0))
3590     .addUse(Scale)
3591     .setMIFlags(Flags);
3592 
3593   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3594     .addUse(Fmas.getReg(0))
3595     .addUse(RHS)
3596     .addUse(LHS)
3597     .setMIFlags(Flags);
3598 
3599   MI.eraseFromParent();
3600   return true;
3601 }
3602 
3603 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3604                                                  MachineRegisterInfo &MRI,
3605                                                  MachineIRBuilder &B) const {
3606   Register Res = MI.getOperand(0).getReg();
3607   Register LHS = MI.getOperand(2).getReg();
3608   Register RHS = MI.getOperand(3).getReg();
3609   uint16_t Flags = MI.getFlags();
3610 
3611   LLT S32 = LLT::scalar(32);
3612   LLT S1 = LLT::scalar(1);
3613 
3614   auto Abs = B.buildFAbs(S32, RHS, Flags);
3615   const APFloat C0Val(1.0f);
3616 
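  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as float bit patterns: if |RHS|
  // is huge, pre-scale it by 2^-32 so the reciprocal doesn't underflow, then
  // multiply the result by the same scale factor to compensate.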
3617   auto C0 = B.buildConstant(S32, 0x6f800000);
3618   auto C1 = B.buildConstant(S32, 0x2f800000);
3619   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3620 
3621   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3622   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3623 
3624   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3625 
3626   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3627     .addUse(Mul0.getReg(0))
3628     .setMIFlags(Flags);
3629 
3630   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3631 
3632   B.buildFMul(Res, Sel, Mul1, Flags);
3633 
3634   MI.eraseFromParent();
3635   return true;
3636 }
3637 
3638 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3639 // FIXME: Why do we handle this one but not other removed instructions?
3640 //
3641 // Reciprocal square root.  The clamp prevents infinite results, clamping
3642 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3643 // +-max_float.
3644 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3645                                                     MachineRegisterInfo &MRI,
3646                                                     MachineIRBuilder &B) const {
3647   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3648     return true;
3649 
3650   Register Dst = MI.getOperand(0).getReg();
3651   Register Src = MI.getOperand(2).getReg();
3652   auto Flags = MI.getFlags();
3653 
3654   LLT Ty = MRI.getType(Dst);
3655 
3656   const fltSemantics *FltSemantics;
3657   if (Ty == LLT::scalar(32))
3658     FltSemantics = &APFloat::IEEEsingle();
3659   else if (Ty == LLT::scalar(64))
3660     FltSemantics = &APFloat::IEEEdouble();
3661   else
3662     return false;
3663 
3664   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3665     .addUse(Src)
3666     .setMIFlags(Flags);
3667 
  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq has already quieted the input (or not), so use whichever
  // min will directly select.
3670   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3671   const bool UseIEEE = MFI->getMode().IEEE;
3672 
3673   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3674   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3675                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3676 
3677   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3678 
3679   if (UseIEEE)
3680     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3681   else
3682     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3683   MI.eraseFromParent();
3684   return true;
3685 }
3686 
3687 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3688   switch (IID) {
3689   case Intrinsic::amdgcn_ds_fadd:
3690     return AMDGPU::G_ATOMICRMW_FADD;
3691   case Intrinsic::amdgcn_ds_fmin:
3692     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3693   case Intrinsic::amdgcn_ds_fmax:
3694     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3695   default:
3696     llvm_unreachable("not a DS FP intrinsic");
3697   }
3698 }
3699 
3700 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3701                                                       MachineInstr &MI,
3702                                                       Intrinsic::ID IID) const {
3703   GISelChangeObserver &Observer = Helper.Observer;
3704   Observer.changingInstr(MI);
3705 
3706   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3707 
3708   // The remaining operands were used to set fields in the MemOperand on
3709   // construction.
3710   for (int I = 6; I > 3; --I)
3711     MI.RemoveOperand(I);
3712 
3713   MI.RemoveOperand(1); // Remove the intrinsic ID.
3714   Observer.changedInstr(MI);
3715   return true;
3716 }
3717 
3718 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3719                                             MachineRegisterInfo &MRI,
3720                                             MachineIRBuilder &B) const {
3721   uint64_t Offset =
3722     ST.getTargetLowering()->getImplicitParameterOffset(
3723       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3724   LLT DstTy = MRI.getType(DstReg);
3725   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3726 
3727   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3728   if (!loadInputValue(KernargPtrReg, B,
3729                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3730     return false;
3731 
3732   // FIXME: This should be nuw
3733   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3734   return true;
3735 }
3736 
3737 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3738                                                  MachineRegisterInfo &MRI,
3739                                                  MachineIRBuilder &B) const {
3740   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3741   if (!MFI->isEntryFunction()) {
3742     return legalizePreloadedArgIntrin(MI, MRI, B,
3743                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3744   }
3745 
3746   Register DstReg = MI.getOperand(0).getReg();
3747   if (!getImplicitArgPtr(DstReg, MRI, B))
3748     return false;
3749 
3750   MI.eraseFromParent();
3751   return true;
3752 }
3753 
3754 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3755                                               MachineRegisterInfo &MRI,
3756                                               MachineIRBuilder &B,
3757                                               unsigned AddrSpace) const {
3758   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3759   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3760   Register Hi32 = Unmerge.getReg(1);
3761 
3762   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3763   MI.eraseFromParent();
3764   return true;
3765 }
3766 
3767 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3768 // offset (the offset that is included in bounds checking and swizzling, to be
3769 // split between the instruction's voffset and immoffset fields) and soffset
3770 // (the offset that is excluded from bounds checking and swizzling, to go in
3771 // the instruction's soffset field).  This function takes the first kind of
3772 // offset and figures out how to split it between voffset and immoffset.
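// For example, a constant total offset of 5000 would be split into an
// immoffset of 904 plus a voffset add of 4096 (see the rounding rules below).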
3773 std::pair<Register, unsigned>
3774 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3775                                         Register OrigOffset) const {
3776   const unsigned MaxImm = 4095;
3777   Register BaseReg;
3778   unsigned ImmOffset;
3779   const LLT S32 = LLT::scalar(32);
3780   MachineRegisterInfo &MRI = *B.getMRI();
3781 
3782   std::tie(BaseReg, ImmOffset) =
3783       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3784 
3785   // If BaseReg is a pointer, convert it to int.
3786   if (MRI.getType(BaseReg).isPointer())
3787     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3788 
  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits there, so that the value that is copied/added for the voffset
  // field is a multiple of 4096 and stands more chance of being CSEd with the
  // copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
3796   unsigned Overflow = ImmOffset & ~MaxImm;
3797   ImmOffset -= Overflow;
3798   if ((int32_t)Overflow < 0) {
3799     Overflow += ImmOffset;
3800     ImmOffset = 0;
3801   }
3802 
3803   if (Overflow != 0) {
3804     if (!BaseReg) {
3805       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3806     } else {
3807       auto OverflowVal = B.buildConstant(S32, Overflow);
3808       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3809     }
3810   }
3811 
3812   if (!BaseReg)
3813     BaseReg = B.buildConstant(S32, 0).getReg(0);
3814 
3815   return std::make_pair(BaseReg, ImmOffset);
3816 }
3817 
3818 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3819 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3820                                           Register VOffset, Register SOffset,
3821                                           unsigned ImmOffset, Register VIndex,
3822                                           MachineRegisterInfo &MRI) const {
3823   Optional<ValueAndVReg> MaybeVOffsetVal =
3824       getIConstantVRegValWithLookThrough(VOffset, MRI);
3825   Optional<ValueAndVReg> MaybeSOffsetVal =
3826       getIConstantVRegValWithLookThrough(SOffset, MRI);
3827   Optional<ValueAndVReg> MaybeVIndexVal =
3828       getIConstantVRegValWithLookThrough(VIndex, MRI);
3829   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3830   // update the MMO with that offset. The stride is unknown so we can only do
3831   // this if VIndex is constant 0.
3832   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3833       MaybeVIndexVal->Value == 0) {
3834     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3835                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3836     MMO->setOffset(TotalOffset);
3837   } else {
3838     // We don't have a constant combined offset to use in the MMO. Give up.
3839     MMO->setValue((Value *)nullptr);
3840   }
3841 }
3842 
3843 /// Handle register layout difference for f16 images for some subtargets.
3844 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3845                                              MachineRegisterInfo &MRI,
3846                                              Register Reg,
3847                                              bool ImageStore) const {
3848   const LLT S16 = LLT::scalar(16);
3849   const LLT S32 = LLT::scalar(32);
3850   LLT StoreVT = MRI.getType(Reg);
3851   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3852 
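  // For example, on unpacked-d16 subtargets a <4 x s16> value becomes a
  // <4 x s32>, with one any-extended element per dword.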
3853   if (ST.hasUnpackedD16VMem()) {
3854     auto Unmerge = B.buildUnmerge(S16, Reg);
3855 
3856     SmallVector<Register, 4> WideRegs;
3857     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3858       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3859 
3860     int NumElts = StoreVT.getNumElements();
3861 
3862     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3863         .getReg(0);
3864   }
3865 
3866   if (ImageStore && ST.hasImageStoreD16Bug()) {
3867     if (StoreVT.getNumElements() == 2) {
3868       SmallVector<Register, 4> PackedRegs;
3869       Reg = B.buildBitcast(S32, Reg).getReg(0);
3870       PackedRegs.push_back(Reg);
3871       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3872       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3873           .getReg(0);
3874     }
3875 
3876     if (StoreVT.getNumElements() == 3) {
3877       SmallVector<Register, 4> PackedRegs;
3878       auto Unmerge = B.buildUnmerge(S16, Reg);
3879       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3880         PackedRegs.push_back(Unmerge.getReg(I));
3881       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3882       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3883       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3884     }
3885 
3886     if (StoreVT.getNumElements() == 4) {
3887       SmallVector<Register, 4> PackedRegs;
3888       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3889       auto Unmerge = B.buildUnmerge(S32, Reg);
3890       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3891         PackedRegs.push_back(Unmerge.getReg(I));
3892       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3893       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3894           .getReg(0);
3895     }
3896 
3897     llvm_unreachable("invalid data type");
3898   }
3899 
3900   if (StoreVT == LLT::fixed_vector(3, S16)) {
3901     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3902               .getReg(0);
3903   }
3904   return Reg;
3905 }
3906 
3907 Register AMDGPULegalizerInfo::fixStoreSourceType(
3908   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3909   MachineRegisterInfo *MRI = B.getMRI();
3910   LLT Ty = MRI->getType(VData);
3911 
3912   const LLT S16 = LLT::scalar(16);
3913 
  // Fixup illegal register types for i8 and i16 stores.
3915   if (Ty == LLT::scalar(8) || Ty == S16) {
3916     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3917     return AnyExt;
3918   }
3919 
3920   if (Ty.isVector()) {
3921     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3922       if (IsFormat)
3923         return handleD16VData(B, *MRI, VData);
3924     }
3925   }
3926 
3927   return VData;
3928 }
3929 
3930 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3931                                               MachineRegisterInfo &MRI,
3932                                               MachineIRBuilder &B,
3933                                               bool IsTyped,
3934                                               bool IsFormat) const {
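  // Expected operand layout: intrinsic ID, vdata, rsrc, [vindex (struct
  // variants only),] voffset, soffset, [format (typed variants only),] aux.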
3935   Register VData = MI.getOperand(1).getReg();
3936   LLT Ty = MRI.getType(VData);
3937   LLT EltTy = Ty.getScalarType();
3938   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3939   const LLT S32 = LLT::scalar(32);
3940 
3941   VData = fixStoreSourceType(B, VData, IsFormat);
3942   Register RSrc = MI.getOperand(2).getReg();
3943 
3944   MachineMemOperand *MMO = *MI.memoperands_begin();
3945   const int MemSize = MMO->getSize();
3946 
3947   unsigned ImmOffset;
3948 
3949   // The typed intrinsics add an immediate after the registers.
3950   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3951 
3952   // The struct intrinsic variants add one additional operand over raw.
3953   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3954   Register VIndex;
3955   int OpOffset = 0;
3956   if (HasVIndex) {
3957     VIndex = MI.getOperand(3).getReg();
3958     OpOffset = 1;
3959   } else {
3960     VIndex = B.buildConstant(S32, 0).getReg(0);
3961   }
3962 
3963   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3964   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3965 
3966   unsigned Format = 0;
3967   if (IsTyped) {
3968     Format = MI.getOperand(5 + OpOffset).getImm();
3969     ++OpOffset;
3970   }
3971 
3972   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3973 
3974   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3975   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3976 
3977   unsigned Opc;
3978   if (IsTyped) {
3979     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3980                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3981   } else if (IsFormat) {
3982     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3983                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3984   } else {
3985     switch (MemSize) {
3986     case 1:
3987       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3988       break;
3989     case 2:
3990       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3991       break;
3992     default:
3993       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3994       break;
3995     }
3996   }
3997 
3998   auto MIB = B.buildInstr(Opc)
3999     .addUse(VData)              // vdata
4000     .addUse(RSrc)               // rsrc
4001     .addUse(VIndex)             // vindex
4002     .addUse(VOffset)            // voffset
4003     .addUse(SOffset)            // soffset
4004     .addImm(ImmOffset);         // offset(imm)
4005 
4006   if (IsTyped)
4007     MIB.addImm(Format);
4008 
4009   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4010      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4011      .addMemOperand(MMO);
4012 
4013   MI.eraseFromParent();
4014   return true;
4015 }
4016 
4017 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4018                                              MachineRegisterInfo &MRI,
4019                                              MachineIRBuilder &B,
4020                                              bool IsFormat,
4021                                              bool IsTyped) const {
4022   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4023   MachineMemOperand *MMO = *MI.memoperands_begin();
4024   const LLT MemTy = MMO->getMemoryType();
4025   const LLT S32 = LLT::scalar(32);
4026 
4027   Register Dst = MI.getOperand(0).getReg();
4028   Register RSrc = MI.getOperand(2).getReg();
4029 
4030   // The typed intrinsics add an immediate after the registers.
4031   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4032 
4033   // The struct intrinsic variants add one additional operand over raw.
4034   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4035   Register VIndex;
4036   int OpOffset = 0;
4037   if (HasVIndex) {
4038     VIndex = MI.getOperand(3).getReg();
4039     OpOffset = 1;
4040   } else {
4041     VIndex = B.buildConstant(S32, 0).getReg(0);
4042   }
4043 
4044   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4045   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4046 
4047   unsigned Format = 0;
4048   if (IsTyped) {
4049     Format = MI.getOperand(5 + OpOffset).getImm();
4050     ++OpOffset;
4051   }
4052 
4053   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4054   unsigned ImmOffset;
4055 
4056   LLT Ty = MRI.getType(Dst);
4057   LLT EltTy = Ty.getScalarType();
4058   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4059   const bool Unpacked = ST.hasUnpackedD16VMem();
4060 
4061   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4062   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4063 
4064   unsigned Opc;
4065 
4066   if (IsTyped) {
4067     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4068                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4069   } else if (IsFormat) {
4070     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4071                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4072   } else {
4073     switch (MemTy.getSizeInBits()) {
4074     case 8:
4075       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4076       break;
4077     case 16:
4078       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4079       break;
4080     default:
4081       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4082       break;
4083     }
4084   }
4085 
4086   Register LoadDstReg;
4087 
4088   bool IsExtLoad =
4089       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4090   LLT UnpackedTy = Ty.changeElementSize(32);
4091 
4092   if (IsExtLoad)
4093     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4094   else if (Unpacked && IsD16 && Ty.isVector())
4095     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4096   else
4097     LoadDstReg = Dst;
4098 
4099   auto MIB = B.buildInstr(Opc)
4100     .addDef(LoadDstReg)         // vdata
4101     .addUse(RSrc)               // rsrc
4102     .addUse(VIndex)             // vindex
4103     .addUse(VOffset)            // voffset
4104     .addUse(SOffset)            // soffset
4105     .addImm(ImmOffset);         // offset(imm)
4106 
4107   if (IsTyped)
4108     MIB.addImm(Format);
4109 
4110   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4111      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4112      .addMemOperand(MMO);
4113 
4114   if (LoadDstReg != Dst) {
4115     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4116 
    // The result was widened for the extending load; truncate it back down.
4118     if (IsExtLoad)
4119       B.buildTrunc(Dst, LoadDstReg);
4120     else {
4121       // Repack to original 16-bit vector result
4122       // FIXME: G_TRUNC should work, but legalization currently fails
4123       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4124       SmallVector<Register, 4> Repack;
4125       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4126         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4127       B.buildMerge(Dst, Repack);
4128     }
4129   }
4130 
4131   MI.eraseFromParent();
4132   return true;
4133 }
4134 
4135 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4136                                                MachineIRBuilder &B,
4137                                                bool IsInc) const {
4138   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4139                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4140   B.buildInstr(Opc)
4141     .addDef(MI.getOperand(0).getReg())
4142     .addUse(MI.getOperand(2).getReg())
4143     .addUse(MI.getOperand(3).getReg())
4144     .cloneMemRefs(MI);
4145   MI.eraseFromParent();
4146   return true;
4147 }
4148 
4149 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4150   switch (IntrID) {
4151   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4152   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4153     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4154   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4155   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4156     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4157   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4158   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4159     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4160   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4161   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4162     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4163   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4164   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4165     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4166   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4167   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4168     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4169   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4170   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4171     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4172   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4173   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4174     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4175   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4176   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4177     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4178   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4179   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4180     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4181   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4182   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4183     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4184   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4185   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4186     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4187   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4188   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4189     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4190   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4191   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4192     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4193   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4194   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4195     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4196   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4197   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4198     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4199   default:
4200     llvm_unreachable("unhandled atomic opcode");
4201   }
4202 }
4203 
4204 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4205                                                MachineIRBuilder &B,
4206                                                Intrinsic::ID IID) const {
4207   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4208                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4209   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4210 
4211   Register Dst;
4212 
4213   int OpOffset = 0;
4214   if (HasReturn) {
4215     // A few FP atomics do not support return values.
4216     Dst = MI.getOperand(0).getReg();
4217   } else {
4218     OpOffset = -1;
4219   }
4220 
4221   Register VData = MI.getOperand(2 + OpOffset).getReg();
4222   Register CmpVal;
4223 
4224   if (IsCmpSwap) {
4225     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4226     ++OpOffset;
4227   }
4228 
4229   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4230   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4231 
4232   // The struct intrinsic variants add one additional operand over raw.
4233   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4234   Register VIndex;
4235   if (HasVIndex) {
4236     VIndex = MI.getOperand(4 + OpOffset).getReg();
4237     ++OpOffset;
4238   } else {
4239     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4240   }
4241 
4242   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4243   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4244   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4245 
4246   MachineMemOperand *MMO = *MI.memoperands_begin();
4247 
4248   unsigned ImmOffset;
4249   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4250   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4251 
4252   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4253 
4254   if (HasReturn)
4255     MIB.addDef(Dst);
4256 
4257   MIB.addUse(VData); // vdata
4258 
4259   if (IsCmpSwap)
4260     MIB.addReg(CmpVal);
4261 
4262   MIB.addUse(RSrc)               // rsrc
4263      .addUse(VIndex)             // vindex
4264      .addUse(VOffset)            // voffset
4265      .addUse(SOffset)            // soffset
4266      .addImm(ImmOffset)          // offset(imm)
4267      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4268      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4269      .addMemOperand(MMO);
4270 
4271   MI.eraseFromParent();
4272   return true;
4273 }
4274 
/// Turn the 16-bit typed address operands of \p MI into dword sized vectors
/// with s16 typed elements, appending the packed registers to \p PackedAddrs.
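/// For example, with A16 enabled a 2D sample's (u, v) coordinates, each of
/// type s16, are expected to end up packed into a single <2 x s16> register.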
4277 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4278                                       SmallVectorImpl<Register> &PackedAddrs,
4279                                       unsigned ArgOffset,
4280                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4281                                       bool IsA16, bool IsG16) {
4282   const LLT S16 = LLT::scalar(16);
4283   const LLT V2S16 = LLT::fixed_vector(2, 16);
4284   auto EndIdx = Intr->VAddrEnd;
4285 
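  // Walk the vaddr operands: 16-bit gradient/coordinate values are packed two
  // at a time into <2 x s16> registers (odd leftovers padded with undef),
  // while operands that stay 32-bit are bitcast to <2 x s16> so every result
  // has a uniform type.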
4286   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4287     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4288     if (!SrcOp.isReg())
4289       continue; // _L to _LZ may have eliminated this.
4290 
4291     Register AddrReg = SrcOp.getReg();
4292 
4293     if ((I < Intr->GradientStart) ||
4294         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4295         (I >= Intr->CoordStart && !IsA16)) {
4296       if ((I < Intr->GradientStart) && IsA16 &&
4297           (B.getMRI()->getType(AddrReg) == S16)) {
4298         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4299         // Special handling of bias when A16 is on: the bias is of type half
4300         // but occupies a full 32-bit dword.
4301         PackedAddrs.push_back(
4302             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4303                 .getReg(0));
4304       } else {
4305         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4306                "Bias needs to be converted to 16 bit in A16 mode");
4307         // Handle any gradient or coordinate operands that should not be packed
4308         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4309         PackedAddrs.push_back(AddrReg);
4310       }
4311     } else {
4312       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4313       // derivatives dx/dh and dx/dv are packed with undef.
4314       if (((I + 1) >= EndIdx) ||
4315           ((Intr->NumGradients / 2) % 2 == 1 &&
4316            (I == static_cast<unsigned>(Intr->GradientStart +
4317                                        (Intr->NumGradients / 2) - 1) ||
4318             I == static_cast<unsigned>(Intr->GradientStart +
4319                                        Intr->NumGradients - 1))) ||
4320           // Check for _L to _LZ optimization
4321           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4322         PackedAddrs.push_back(
4323             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4324                 .getReg(0));
4325       } else {
4326         PackedAddrs.push_back(
4327             B.buildBuildVector(
4328                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4329                 .getReg(0));
4330         ++I;
4331       }
4332     }
4333   }
4334 }
4335 
4336 /// Convert from separate vaddr components to a single vector address register,
4337 /// and replace the remaining operands with $noreg.
4338 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4339                                      int DimIdx, int NumVAddrs) {
4340   const LLT S32 = LLT::scalar(32);
4341 
4342   SmallVector<Register, 8> AddrRegs;
4343   for (int I = 0; I != NumVAddrs; ++I) {
4344     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4345     if (SrcOp.isReg()) {
4346       AddrRegs.push_back(SrcOp.getReg());
4347       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4348     }
4349   }
4350 
4351   int NumAddrRegs = AddrRegs.size();
4352   if (NumAddrRegs != 1) {
4353     // Above 8 elements round up to next power of 2 (i.e. 16).
4354     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4355       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4356       auto Undef = B.buildUndef(S32);
4357       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4358       NumAddrRegs = RoundedNumRegs;
4359     }
4360 
4361     auto VAddr =
4362         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4363     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4364   }
4365 
4366   for (int I = 1; I != NumVAddrs; ++I) {
4367     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4368     if (SrcOp.isReg())
4369       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4370   }
4371 }
4372 
4373 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4374 ///
4375 /// Depending on the subtarget, load/store with 16-bit element data need to be
4376 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4377 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4378 /// registers.
4379 ///
4380 /// We don't want to directly select image instructions just yet, but also want
4381 /// to expose all register repacking to the legalizer/combiners. We also don't
4382 /// want a selected instruction entering RegBankSelect. In order to avoid
4383 /// defining a multitude of intermediate image instructions, directly hack on
4384 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4385 /// padding now-unnecessary arguments with $noreg.
4386 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4387     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4388     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4389 
4390   const unsigned NumDefs = MI.getNumExplicitDefs();
4391   const unsigned ArgOffset = NumDefs + 1;
4392   bool IsTFE = NumDefs == 2;
4393   // We are only processing the operands of d16 image operations on subtargets
4394   // that use the unpacked register layout, or need to repack the TFE result.
4395 
4396   // TODO: Do we need to guard against already legalized intrinsics?
4397   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4398       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4399 
4400   MachineRegisterInfo *MRI = B.getMRI();
4401   const LLT S32 = LLT::scalar(32);
4402   const LLT S16 = LLT::scalar(16);
4403   const LLT V2S16 = LLT::fixed_vector(2, 16);
4404 
4405   unsigned DMask = 0;
4406   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4407   LLT Ty = MRI->getType(VData);
4408 
4409   // Check for 16 bit addresses and pack if true.
4410   LLT GradTy =
4411       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4412   LLT AddrTy =
4413       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4414   const bool IsG16 = GradTy == S16;
4415   const bool IsA16 = AddrTy == S16;
4416   const bool IsD16 = Ty.getScalarType() == S16;
4417 
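  // Determine how many result components the dmask actually requests; gather4
  // always returns four.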
4418   int DMaskLanes = 0;
4419   if (!BaseOpcode->Atomic) {
4420     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4421     if (BaseOpcode->Gather4) {
4422       DMaskLanes = 4;
4423     } else if (DMask != 0) {
4424       DMaskLanes = countPopulation(DMask);
4425     } else if (!IsTFE && !BaseOpcode->Store) {
4426       // If dmask is 0, this is a no-op load. This can be eliminated.
4427       B.buildUndef(MI.getOperand(0));
4428       MI.eraseFromParent();
4429       return true;
4430     }
4431   }
4432 
4433   Observer.changingInstr(MI);
4434   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4435 
4436   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4437                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4438   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4439                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4440   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4441 
4442   // Track that we legalized this by switching to the target pseudo opcode.
4443   MI.setDesc(B.getTII().get(NewOpcode));
4444 
4445   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4446   // dmask to be at least 1, otherwise the instruction will fail.
4447   if (IsTFE && DMask == 0) {
4448     DMask = 0x1;
4449     DMaskLanes = 1;
4450     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4451   }
4452 
4453   if (BaseOpcode->Atomic) {
4454     Register VData0 = MI.getOperand(2).getReg();
4455     LLT Ty = MRI->getType(VData0);
4456 
4457     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4458     if (Ty.isVector())
4459       return false;
4460 
4461     if (BaseOpcode->AtomicX2) {
4462       Register VData1 = MI.getOperand(3).getReg();
4463       // The two values are packed in one register.
4464       LLT PackedTy = LLT::fixed_vector(2, Ty);
4465       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4466       MI.getOperand(2).setReg(Concat.getReg(0));
4467       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4468     }
4469   }
4470 
4471   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4472 
4473   // Rewrite the addressing register layout before doing anything else.
4474   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4475     // 16-bit gradients are supported, but they are tied to the A16 control,
4476     // so both gradients and addresses must be 16-bit.
4477     return false;
4478   }
4479 
4480   if (IsA16 && !ST.hasA16()) {
4481     // A16 not supported
4482     return false;
4483   }
4484 
4485   if (IsA16 || IsG16) {
4486     if (Intr->NumVAddrs > 1) {
4487       SmallVector<Register, 4> PackedRegs;
4488 
4489       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4490                                 IsG16);
4491 
4492       // See also below in the non-a16 branch
4493       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4494                           PackedRegs.size() <= ST.getNSAMaxSize();
4495 
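      // Without NSA all address components must live in one contiguous
      // register, so concatenate the packed dwords into a single vector.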
4496       if (!UseNSA && PackedRegs.size() > 1) {
4497         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4498         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4499         PackedRegs[0] = Concat.getReg(0);
4500         PackedRegs.resize(1);
4501       }
4502 
4503       const unsigned NumPacked = PackedRegs.size();
4504       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4505         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4506         if (!SrcOp.isReg()) {
4507           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4508           continue;
4509         }
4510 
4511         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4512 
4513         if (I - Intr->VAddrStart < NumPacked)
4514           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4515         else
4516           SrcOp.setReg(AMDGPU::NoRegister);
4517       }
4518     }
4519   } else {
4520     // If the register allocator cannot place the address registers contiguously
4521     // without introducing moves, then using the non-sequential address encoding
4522     // is always preferable, since it saves VALU instructions and is usually a
4523     // wash in terms of code size or even better.
4524     //
4525     // However, we currently have no way of hinting to the register allocator
4526     // that MIMG addresses should be placed contiguously when it is possible to
4527     // do so, so force non-NSA for the common 2-address case as a heuristic.
4528     //
4529     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4530     // allocation when possible.
4531     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4532                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4533 
4534     if (!UseNSA && Intr->NumVAddrs > 1)
4535       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4536                                Intr->NumVAddrs);
4537   }
4538 
4539   int Flags = 0;
4540   if (IsA16)
4541     Flags |= 1;
4542   if (IsG16)
4543     Flags |= 2;
4544   MI.addOperand(MachineOperand::CreateImm(Flags));
4545 
4546   if (BaseOpcode->Store) { // No TFE for stores?
4547     // TODO: Handle dmask trim
4548     if (!Ty.isVector() || !IsD16)
4549       return true;
4550 
4551     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4552     if (RepackedReg != VData) {
4553       MI.getOperand(1).setReg(RepackedReg);
4554     }
4555 
4556     return true;
4557   }
4558 
4559   Register DstReg = MI.getOperand(0).getReg();
4560   const LLT EltTy = Ty.getScalarType();
4561   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4562 
4563   // Confirm that the return type is large enough for the dmask specified
4564   if (NumElts < DMaskLanes)
4565     return false;
4566 
4567   if (NumElts > 4 || DMaskLanes > 4)
4568     return false;
4569 
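  // Number of data components the instruction really produces; atomics leave
  // DMaskLanes at 0 but still return a single value.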
4570   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4571   const LLT AdjustedTy =
4572       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4573 
4574   // The raw dword aligned data component of the load. The only legal cases
4575   // where this matters should be when using the packed D16 format, for
4576   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4577   LLT RoundedTy;
4578 
4579   // S32 vector to cover all data, plus the TFE result element.
4580   LLT TFETy;
4581 
4582   // Register type to use for each loaded component. Will be S32 or V2S16.
4583   LLT RegTy;
4584 
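  // With the unpacked D16 layout each 16-bit component occupies a full dword;
  // otherwise round the packed payload up to a whole number of dwords.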
4585   if (IsD16 && ST.hasUnpackedD16VMem()) {
4586     RoundedTy =
4587         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4588     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4589     RegTy = S32;
4590   } else {
4591     unsigned EltSize = EltTy.getSizeInBits();
4592     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4593     unsigned RoundedSize = 32 * RoundedElts;
4594     RoundedTy = LLT::scalarOrVector(
4595         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4596     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4597     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4598   }
4599 
4600   // The return type does not need adjustment.
4601   // TODO: Should we change s16 case to s32 or <2 x s16>?
4602   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4603     return true;
4604 
4605   Register Dst1Reg;
4606 
4607   // Insert after the instruction.
4608   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4609 
4610   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4611   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4612   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
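  // Number of 32-bit registers in the raw result, including the trailing TFE
  // dword when present.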
4613   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4614 
4615   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4616 
4617   MI.getOperand(0).setReg(NewResultReg);
4618 
4619   // In the IR, TFE is supposed to be used with a 2 element struct return
4620   // type. The instruction really returns these two values in one contiguous
4621   // register, with one additional dword beyond the loaded data. Rewrite the
4622   // return type to use a single register result.
4623 
4624   if (IsTFE) {
4625     Dst1Reg = MI.getOperand(1).getReg();
4626     if (MRI->getType(Dst1Reg) != S32)
4627       return false;
4628 
4629     // TODO: Make sure the TFE operand bit is set.
4630     MI.RemoveOperand(1);
4631 
4632     // Handle the easy case that requires no repack instructions.
4633     if (Ty == S32) {
4634       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4635       return true;
4636     }
4637   }
4638 
4639   // Now figure out how to copy the new result register back into the old
4640   // result.
4641   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4642 
4643   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4644 
4645   if (ResultNumRegs == 1) {
4646     assert(!IsTFE);
4647     ResultRegs[0] = NewResultReg;
4648   } else {
4649     // We have to repack into a new vector of some kind.
4650     for (int I = 0; I != NumDataRegs; ++I)
4651       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4652     B.buildUnmerge(ResultRegs, NewResultReg);
4653 
4654     // Drop the final TFE element to get the data part. The TFE result is
4655     // directly written to the right place already.
4656     if (IsTFE)
4657       ResultRegs.resize(NumDataRegs);
4658   }
4659 
4660   // For an s16 scalar result, we form an s32 result with a truncate regardless
4661   // of packed vs. unpacked.
4662   if (IsD16 && !Ty.isVector()) {
4663     B.buildTrunc(DstReg, ResultRegs[0]);
4664     return true;
4665   }
4666 
4667   // Avoid a build/concat_vector of 1 entry.
4668   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4669     B.buildBitcast(DstReg, ResultRegs[0]);
4670     return true;
4671   }
4672 
4673   assert(Ty.isVector());
4674 
4675   if (IsD16) {
4676     // For packed D16 results with TFE enabled, all the data components are
4677     // S32. Cast back to the expected type.
4678     //
4679     // TODO: We don't really need to use s32 load elements. We would only
4680     // need one cast for the TFE result if a multiple of v2s16 was used.
4681     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4682       for (Register &Reg : ResultRegs)
4683         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4684     } else if (ST.hasUnpackedD16VMem()) {
4685       for (Register &Reg : ResultRegs)
4686         Reg = B.buildTrunc(S16, Reg).getReg(0);
4687     }
4688   }
4689 
4690   auto padWithUndef = [&](LLT Ty, int NumElts) {
4691     if (NumElts == 0)
4692       return;
4693     Register Undef = B.buildUndef(Ty).getReg(0);
4694     for (int I = 0; I != NumElts; ++I)
4695       ResultRegs.push_back(Undef);
4696   };
4697 
4698   // Pad out any elements eliminated due to the dmask.
4699   LLT ResTy = MRI->getType(ResultRegs[0]);
4700   if (!ResTy.isVector()) {
4701     padWithUndef(ResTy, NumElts - ResultRegs.size());
4702     B.buildBuildVector(DstReg, ResultRegs);
4703     return true;
4704   }
4705 
4706   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
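  // Number of dword-sized registers needed to cover the full result type.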
4707   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4708 
4709   // Deal with the one annoying legal case.
4710   const LLT V3S16 = LLT::fixed_vector(3, 16);
4711   if (Ty == V3S16) {
4712     if (IsTFE) {
4713       if (ResultRegs.size() == 1) {
4714         NewResultReg = ResultRegs[0];
4715       } else if (ResultRegs.size() == 2) {
4716         LLT V4S16 = LLT::fixed_vector(4, 16);
4717         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4718       } else {
4719         return false;
4720       }
4721     }
4722 
4723     if (MRI->getType(DstReg).getNumElements() <
4724         MRI->getType(NewResultReg).getNumElements()) {
4725       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4726     } else {
4727       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4728     }
4729     return true;
4730   }
4731 
4732   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4733   B.buildConcatVectors(DstReg, ResultRegs);
4734   return true;
4735 }
4736 
4737 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4738   LegalizerHelper &Helper, MachineInstr &MI) const {
4739   MachineIRBuilder &B = Helper.MIRBuilder;
4740   GISelChangeObserver &Observer = Helper.Observer;
4741 
4742   Register Dst = MI.getOperand(0).getReg();
4743   LLT Ty = B.getMRI()->getType(Dst);
4744   unsigned Size = Ty.getSizeInBits();
4745   MachineFunction &MF = B.getMF();
4746 
4747   Observer.changingInstr(MI);
4748 
4749   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4750     Ty = getBitcastRegisterType(Ty);
4751     Helper.bitcastDst(MI, Ty, 0);
4752     Dst = MI.getOperand(0).getReg();
4753     B.setInsertPt(B.getMBB(), MI);
4754   }
4755 
4756   // FIXME: We don't really need this intermediate instruction. The intrinsic
4757   // should be fixed to have a memory operand. Since it's readnone, we're not
4758   // allowed to add one.
4759   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4760   MI.RemoveOperand(1); // Remove intrinsic ID
4761 
4762   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4763   // TODO: Should this use datalayout alignment?
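  // Round the result size up to whole bytes for the memory operand.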
4764   const unsigned MemSize = (Size + 7) / 8;
4765   const Align MemAlign(4);
4766   MachineMemOperand *MMO = MF.getMachineMemOperand(
4767       MachinePointerInfo(),
4768       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4769           MachineMemOperand::MOInvariant,
4770       MemSize, MemAlign);
4771   MI.addMemOperand(MF, MMO);
4772 
4773   // There are no 96-bit result scalar loads, but widening to 128-bit should
4774   // always be legal. We may need to restore this to a 96-bit result if it turns
4775   // out this needs to be converted to a vector load during RegBankSelect.
4776   if (!isPowerOf2_32(Size)) {
4777     if (Ty.isVector())
4778       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4779     else
4780       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4781   }
4782 
4783   Observer.changedInstr(MI);
4784   return true;
4785 }
4786 
4787 // TODO: Move to selection
4788 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4789                                                 MachineRegisterInfo &MRI,
4790                                                 MachineIRBuilder &B) const {
4791   if (!ST.isTrapHandlerEnabled() ||
4792       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4793     return legalizeTrapEndpgm(MI, MRI, B);
4794 
4795   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4796     switch (*HsaAbiVer) {
4797     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4798     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4799       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4800     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4801     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
4802       return ST.supportsGetDoorbellID() ?
4803           legalizeTrapHsa(MI, MRI, B) :
4804           legalizeTrapHsaQueuePtr(MI, MRI, B);
4805     }
4806   }
4807 
4808   llvm_unreachable("Unknown trap handler");
4809 }
4810 
4811 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4812     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4813   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4814   MI.eraseFromParent();
4815   return true;
4816 }
4817 
4818 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4819     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4820   // Pass queue pointer to trap handler as input, and insert trap instruction
4821   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4822   Register LiveIn =
4823     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4824   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4825     return false;
4826 
4827   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4828   B.buildCopy(SGPR01, LiveIn);
4829   B.buildInstr(AMDGPU::S_TRAP)
4830       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4831       .addReg(SGPR01, RegState::Implicit);
4832 
4833   MI.eraseFromParent();
4834   return true;
4835 }
4836 
4837 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4838     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4839   B.buildInstr(AMDGPU::S_TRAP)
4840       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4841   MI.eraseFromParent();
4842   return true;
4843 }
4844 
4845 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4846     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4847   // If this is a non-HSA path or the trap handler is disabled, report a
4848   // warning accordingly.
4849   if (!ST.isTrapHandlerEnabled() ||
4850       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4851     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4852                                      "debugtrap handler not supported",
4853                                      MI.getDebugLoc(), DS_Warning);
4854     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4855     Ctx.diagnose(NoTrap);
4856   } else {
4857     // Insert debug-trap instruction
4858     B.buildInstr(AMDGPU::S_TRAP)
4859         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4860   }
4861 
4862   MI.eraseFromParent();
4863   return true;
4864 }
4865 
4866 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4867                                                MachineIRBuilder &B) const {
4868   MachineRegisterInfo &MRI = *B.getMRI();
4869   const LLT S16 = LLT::scalar(16);
4870   const LLT S32 = LLT::scalar(32);
4871 
4872   Register DstReg = MI.getOperand(0).getReg();
4873   Register NodePtr = MI.getOperand(2).getReg();
4874   Register RayExtent = MI.getOperand(3).getReg();
4875   Register RayOrigin = MI.getOperand(4).getReg();
4876   Register RayDir = MI.getOperand(5).getReg();
4877   Register RayInvDir = MI.getOperand(6).getReg();
4878   Register TDescr = MI.getOperand(7).getReg();
4879 
4880   if (!ST.hasGFX10_AEncoding()) {
4881     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4882                                         "intrinsic not supported on subtarget",
4883                                         MI.getDebugLoc());
4884     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4885     return false;
4886   }
4887 
4888   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4889   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4890   const unsigned NumVDataDwords = 4;
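  // vaddr dword count: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
  // ray_dir/ray_inv_dir (3 dwords when packed as A16, otherwise 6).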
4891   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4892   const bool UseNSA =
4893       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4894   const unsigned BaseOpcodes[2][2] = {
4895       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4896       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4897        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4898   int Opcode;
4899   if (UseNSA) {
4900     Opcode =
4901         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4902                               NumVDataDwords, NumVAddrDwords);
4903   } else {
4904     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4905                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4906                                    PowerOf2Ceil(NumVAddrDwords));
4907   }
4908   assert(Opcode != -1);
4909 
4910   SmallVector<Register, 12> Ops;
4911   if (Is64) {
4912     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4913     Ops.push_back(Unmerge.getReg(0));
4914     Ops.push_back(Unmerge.getReg(1));
4915   } else {
4916     Ops.push_back(NodePtr);
4917   }
4918   Ops.push_back(RayExtent);
4919 
4920   auto packLanes = [&Ops, &S32, &B](Register Src) {
4921     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4922     Ops.push_back(Unmerge.getReg(0));
4923     Ops.push_back(Unmerge.getReg(1));
4924     Ops.push_back(Unmerge.getReg(2));
4925   };
4926 
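  // Ray origin (and the direction vectors when not using A16) are <3 x s32>;
  // split them into the three scalar dword operands the instruction expects.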
4927   packLanes(RayOrigin);
4928   if (IsA16) {
4929     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4930     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4931     Register R1 = MRI.createGenericVirtualRegister(S32);
4932     Register R2 = MRI.createGenericVirtualRegister(S32);
4933     Register R3 = MRI.createGenericVirtualRegister(S32);
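    // Pack the six 16-bit direction / inverse-direction components into three
    // dwords: (dir.x, dir.y), (dir.z, inv.x), (inv.y, inv.z).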
4934     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4935     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4936     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4937     Ops.push_back(R1);
4938     Ops.push_back(R2);
4939     Ops.push_back(R3);
4940   } else {
4941     packLanes(RayDir);
4942     packLanes(RayInvDir);
4943   }
4944 
4945   if (!UseNSA) {
4946     // Build a single vector containing all the operands so far prepared.
4947     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4948     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4949     Ops.clear();
4950     Ops.push_back(MergedOps);
4951   }
4952 
4953   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4954     .addDef(DstReg)
4955     .addImm(Opcode);
4956 
4957   for (Register R : Ops) {
4958     MIB.addUse(R);
4959   }
4960 
4961   MIB.addUse(TDescr)
4962      .addImm(IsA16 ? 1 : 0)
4963      .cloneMemRefs(MI);
4964 
4965   MI.eraseFromParent();
4966   return true;
4967 }
4968 
4969 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4970   B.buildConstant(MI.getOperand(0).getReg(), C);
4971   MI.eraseFromParent();
4972   return true;
4973 }
4974 
4975 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
4976                                                MachineIRBuilder &B) const {
4977   unsigned Opc;
4978   int RoundMode = MI.getOperand(2).getImm();
4979 
4980   if (RoundMode == (int)RoundingMode::TowardPositive)
4981     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
4982   else if (RoundMode == (int)RoundingMode::TowardNegative)
4983     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
4984   else
4985     return false;
4986 
4987   B.buildInstr(Opc)
4988       .addDef(MI.getOperand(0).getReg())
4989       .addUse(MI.getOperand(1).getReg());
4990 
4991   MI.eraseFromParent();
4992 
4993   return true;
4994 }
4995 
4996 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4997                                             MachineInstr &MI) const {
4998   MachineIRBuilder &B = Helper.MIRBuilder;
4999   MachineRegisterInfo &MRI = *B.getMRI();
5000 
5001   // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
5002   auto IntrID = MI.getIntrinsicID();
5003   switch (IntrID) {
5004   case Intrinsic::amdgcn_if:
5005   case Intrinsic::amdgcn_else: {
5006     MachineInstr *Br = nullptr;
5007     MachineBasicBlock *UncondBrTarget = nullptr;
5008     bool Negated = false;
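    // Find the G_BRCOND that consumes this intrinsic's result; it is rewritten
    // below into the SI_IF/SI_ELSE pseudo and its branch targets fixed up.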
5009     if (MachineInstr *BrCond =
5010             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5011       const SIRegisterInfo *TRI
5012         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5013 
5014       Register Def = MI.getOperand(1).getReg();
5015       Register Use = MI.getOperand(3).getReg();
5016 
5017       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5018 
5019       if (Negated)
5020         std::swap(CondBrTarget, UncondBrTarget);
5021 
5022       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5023       if (IntrID == Intrinsic::amdgcn_if) {
5024         B.buildInstr(AMDGPU::SI_IF)
5025           .addDef(Def)
5026           .addUse(Use)
5027           .addMBB(UncondBrTarget);
5028       } else {
5029         B.buildInstr(AMDGPU::SI_ELSE)
5030             .addDef(Def)
5031             .addUse(Use)
5032             .addMBB(UncondBrTarget);
5033       }
5034 
5035       if (Br) {
5036         Br->getOperand(0).setMBB(CondBrTarget);
5037       } else {
5038         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5039         // since we're swapping branch targets it needs to be reinserted.
5040         // FIXME: IRTranslator should probably not do this
5041         B.buildBr(*CondBrTarget);
5042       }
5043 
5044       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5045       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5046       MI.eraseFromParent();
5047       BrCond->eraseFromParent();
5048       return true;
5049     }
5050 
5051     return false;
5052   }
5053   case Intrinsic::amdgcn_loop: {
5054     MachineInstr *Br = nullptr;
5055     MachineBasicBlock *UncondBrTarget = nullptr;
5056     bool Negated = false;
5057     if (MachineInstr *BrCond =
5058             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5059       const SIRegisterInfo *TRI
5060         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5061 
5062       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5063       Register Reg = MI.getOperand(2).getReg();
5064 
5065       if (Negated)
5066         std::swap(CondBrTarget, UncondBrTarget);
5067 
5068       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5069       B.buildInstr(AMDGPU::SI_LOOP)
5070         .addUse(Reg)
5071         .addMBB(UncondBrTarget);
5072 
5073       if (Br)
5074         Br->getOperand(0).setMBB(CondBrTarget);
5075       else
5076         B.buildBr(*CondBrTarget);
5077 
5078       MI.eraseFromParent();
5079       BrCond->eraseFromParent();
5080       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5081       return true;
5082     }
5083 
5084     return false;
5085   }
5086   case Intrinsic::amdgcn_kernarg_segment_ptr:
5087     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5088       // This only makes sense to call in a kernel, so just lower to null.
5089       B.buildConstant(MI.getOperand(0).getReg(), 0);
5090       MI.eraseFromParent();
5091       return true;
5092     }
5093 
5094     return legalizePreloadedArgIntrin(
5095       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5096   case Intrinsic::amdgcn_implicitarg_ptr:
5097     return legalizeImplicitArgPtr(MI, MRI, B);
5098   case Intrinsic::amdgcn_workitem_id_x:
5099     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5100       return replaceWithConstant(B, MI, 0);
5101     return legalizePreloadedArgIntrin(MI, MRI, B,
5102                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5103   case Intrinsic::amdgcn_workitem_id_y:
5104     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5105       return replaceWithConstant(B, MI, 0);
5106 
5107     return legalizePreloadedArgIntrin(MI, MRI, B,
5108                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5109   case Intrinsic::amdgcn_workitem_id_z:
5110     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5111       return replaceWithConstant(B, MI, 0);
5112 
5113     return legalizePreloadedArgIntrin(MI, MRI, B,
5114                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5115   case Intrinsic::amdgcn_workgroup_id_x:
5116     return legalizePreloadedArgIntrin(MI, MRI, B,
5117                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5118   case Intrinsic::amdgcn_workgroup_id_y:
5119     return legalizePreloadedArgIntrin(MI, MRI, B,
5120                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5121   case Intrinsic::amdgcn_workgroup_id_z:
5122     return legalizePreloadedArgIntrin(MI, MRI, B,
5123                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5124   case Intrinsic::amdgcn_dispatch_ptr:
5125     return legalizePreloadedArgIntrin(MI, MRI, B,
5126                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5127   case Intrinsic::amdgcn_queue_ptr:
5128     return legalizePreloadedArgIntrin(MI, MRI, B,
5129                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5130   case Intrinsic::amdgcn_implicit_buffer_ptr:
5131     return legalizePreloadedArgIntrin(
5132       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5133   case Intrinsic::amdgcn_dispatch_id:
5134     return legalizePreloadedArgIntrin(MI, MRI, B,
5135                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5136   case Intrinsic::amdgcn_fdiv_fast:
5137     return legalizeFDIVFastIntrin(MI, MRI, B);
5138   case Intrinsic::amdgcn_is_shared:
5139     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5140   case Intrinsic::amdgcn_is_private:
5141     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5142   case Intrinsic::amdgcn_wavefrontsize: {
5143     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5144     MI.eraseFromParent();
5145     return true;
5146   }
5147   case Intrinsic::amdgcn_s_buffer_load:
5148     return legalizeSBufferLoad(Helper, MI);
5149   case Intrinsic::amdgcn_raw_buffer_store:
5150   case Intrinsic::amdgcn_struct_buffer_store:
5151     return legalizeBufferStore(MI, MRI, B, false, false);
5152   case Intrinsic::amdgcn_raw_buffer_store_format:
5153   case Intrinsic::amdgcn_struct_buffer_store_format:
5154     return legalizeBufferStore(MI, MRI, B, false, true);
5155   case Intrinsic::amdgcn_raw_tbuffer_store:
5156   case Intrinsic::amdgcn_struct_tbuffer_store:
5157     return legalizeBufferStore(MI, MRI, B, true, true);
5158   case Intrinsic::amdgcn_raw_buffer_load:
5159   case Intrinsic::amdgcn_struct_buffer_load:
5160     return legalizeBufferLoad(MI, MRI, B, false, false);
5161   case Intrinsic::amdgcn_raw_buffer_load_format:
5162   case Intrinsic::amdgcn_struct_buffer_load_format:
5163     return legalizeBufferLoad(MI, MRI, B, true, false);
5164   case Intrinsic::amdgcn_raw_tbuffer_load:
5165   case Intrinsic::amdgcn_struct_tbuffer_load:
5166     return legalizeBufferLoad(MI, MRI, B, true, true);
5167   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5168   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5169   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5170   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5171   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5172   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5173   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5174   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5175   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5176   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5177   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5178   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5179   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5180   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5181   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5182   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5183   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5184   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5185   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5186   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5187   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5188   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5189   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5190   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5191   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5192   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5193   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5194   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5195   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5196   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5197     return legalizeBufferAtomic(MI, B, IntrID);
5198   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5199   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5200     Register DstReg = MI.getOperand(0).getReg();
5201     if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
5202       Function &F = B.getMF().getFunction();
5203       DiagnosticInfoUnsupported NoFpRet(
5204           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5205           DS_Error);
5206       F.getContext().diagnose(NoFpRet);
5207       B.buildUndef(DstReg);
5208       MI.eraseFromParent();
5209       return true;
5210     }
5211 
5212     return legalizeBufferAtomic(MI, B, IntrID);
5213   }
5214   case Intrinsic::amdgcn_atomic_inc:
5215     return legalizeAtomicIncDec(MI, B, true);
5216   case Intrinsic::amdgcn_atomic_dec:
5217     return legalizeAtomicIncDec(MI, B, false);
5218   case Intrinsic::trap:
5219     return legalizeTrapIntrinsic(MI, MRI, B);
5220   case Intrinsic::debugtrap:
5221     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5222   case Intrinsic::amdgcn_rsq_clamp:
5223     return legalizeRsqClampIntrinsic(MI, MRI, B);
5224   case Intrinsic::amdgcn_ds_fadd:
5225   case Intrinsic::amdgcn_ds_fmin:
5226   case Intrinsic::amdgcn_ds_fmax:
5227     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5228   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5229     return legalizeBVHIntrinsic(MI, B);
5230   default: {
5231     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5232             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5233       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5234     return true;
5235   }
5236   }
5237 
5238   return true;
5239 }
5240