1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
// Round the number of elements up to the next power of two.
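// e.g. <3 x s16> becomes <4 x s16>, and <5 x s32> becomes <8 x s32>.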
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
// Round the scalar size up to the next power of two bits.
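// e.g. s24 becomes s32, and s48 becomes s64.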
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
/// \returns true if this is an odd-sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
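// Reduce the number of elements to split a wide vector into ~64-bit pieces,
// e.g. <5 x s32> (160 bits) needs three pieces, so it becomes <2 x s32>.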
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
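// e.g. <3 x s8> (24 bits) widens to <4 x s8>, and <3 x s16> (48 bits) widens
// to <4 x s16>.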
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
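    // NextMul32 is the number of 32-bit pieces needed, so 32 * NextMul32 is
    // the padded size in bits; round up to a whole number of elements.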
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
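  // e.g. <6 x s16> (96 bits) -> <3 x s32>; <8 x s16> (128 bits) -> <4 x s32>.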
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
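// A register-sized value occupies a whole number of 32-bit registers, up to
// the 1024-bit maximum (32 dwords).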
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226   return [=](const LegalityQuery &Query) {
227     const LLT Ty = Query.Types[TypeIdx];
228     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230   };
231 }
232 
233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237                                     bool IsLoad) {
238   switch (AS) {
239   case AMDGPUAS::PRIVATE_ADDRESS:
240     // FIXME: Private element size.
241     return ST.enableFlatScratch() ? 128 : 32;
242   case AMDGPUAS::LOCAL_ADDRESS:
243     return ST.useDS128() ? 128 : 64;
244   case AMDGPUAS::GLOBAL_ADDRESS:
245   case AMDGPUAS::CONSTANT_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
253     return IsLoad ? 512 : 128;
254   default:
255     // Flat addresses may contextually need to be split to 32-bit parts if they
256     // may alias scratch depending on the subtarget.
257     return 128;
258   }
259 }
260 
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262                                  const LegalityQuery &Query,
263                                  unsigned Opcode) {
264   const LLT Ty = Query.Types[0];
265 
266   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267   const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 
269   unsigned RegSize = Ty.getSizeInBits();
270   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272   unsigned AS = Query.Types[1].getAddressSpace();
273 
274   // All of these need to be custom lowered to cast the pointer operand.
275   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276     return false;
277 
278   // TODO: We should be able to widen loads if the alignment is high enough, but
279   // we also need to modify the memory access size.
280 #if 0
281   // Accept widening loads based on alignment.
282   if (IsLoad && MemSize < Size)
283     MemSize = std::max(MemSize, Align);
284 #endif
285 
286   // Only 1-byte and 2-byte to 32-bit extloads are valid.
287   if (MemSize != RegSize && RegSize != 32)
288     return false;
289 
290   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291     return false;
292 
293   switch (MemSize) {
294   case 8:
295   case 16:
296   case 32:
297   case 64:
298   case 128:
299     break;
300   case 96:
301     if (!ST.hasDwordx3LoadStores())
302       return false;
303     break;
304   case 256:
305   case 512:
306     // These may contextually need to be broken down.
307     break;
308   default:
309     return false;
310   }
311 
312   assert(RegSize >= MemSize);
313 
314   if (AlignBits < MemSize) {
315     const SITargetLowering *TLI = ST.getTargetLowering();
316     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317                                                  Align(AlignBits / 8)))
318       return false;
319   }
320 
321   return true;
322 }
323 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
328 static bool loadStoreBitcastWorkaround(const LLT Ty) {
329   if (EnableNewLegality)
330     return false;
331 
332   const unsigned Size = Ty.getSizeInBits();
333   if (Size <= 64)
334     return false;
335   if (!Ty.isVector())
336     return true;
337 
338   LLT EltTy = Ty.getElementType();
339   if (EltTy.isPointer())
340     return true;
341 
342   unsigned EltSize = EltTy.getSizeInBits();
343   return EltSize != 32 && EltSize != 64;
344 }
345 
346 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347                              unsigned Opcode) {
348   const LLT Ty = Query.Types[0];
349   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350          !loadStoreBitcastWorkaround(Ty);
351 }
352 
353 /// Return true if a load or store of the type should be lowered with a bitcast
354 /// to a different type.
355 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356                                        const unsigned MemSizeInBits) {
357   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
360 
361   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362     return true;
363   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364          !isRegisterVectorElementType(Ty.getElementType());
365 }
366 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access
/// itself changes, not the size of the result register.
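// e.g. on subtargets without 96-bit (dwordx3) load/store, a 96-bit load that
// is known to be 128-bit aligned can be widened to a 128-bit load.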
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371                             unsigned AlignInBits, unsigned AddrSpace,
372                             unsigned Opcode) {
373   // We don't want to widen cases that are naturally legal.
374   if (isPowerOf2_32(SizeInBits))
375     return false;
376 
377   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378   // end up widening these for a scalar load during RegBankSelect, since there
379   // aren't 96-bit scalar loads.
380   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381     return false;
382 
383   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384     return false;
385 
386   // A load is known dereferenceable up to the alignment, so it's legal to widen
387   // to it.
388   //
389   // TODO: Could check dereferenceable for less aligned cases.
390   unsigned RoundedSize = NextPowerOf2(SizeInBits);
391   if (AlignInBits < RoundedSize)
392     return false;
393 
394   // Do not widen if it would introduce a slow unaligned load.
395   const SITargetLowering *TLI = ST.getTargetLowering();
396   bool Fast = false;
397   return TLI->allowsMisalignedMemoryAccessesImpl(
398              RoundedSize, AddrSpace, Align(AlignInBits / 8),
399              MachineMemOperand::MOLoad, &Fast) &&
400          Fast;
401 }
402 
403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404                             unsigned Opcode) {
405   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406     return false;
407 
408   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409                          Query.MMODescrs[0].AlignInBits,
410                          Query.Types[1].getAddressSpace(), Opcode);
411 }
412 
413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414                                          const GCNTargetMachine &TM)
  : ST(ST_) {
416   using namespace TargetOpcode;
417 
418   auto GetAddrSpacePtr = [&TM](unsigned AS) {
419     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420   };
421 
422   const LLT S1 = LLT::scalar(1);
423   const LLT S8 = LLT::scalar(8);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S8 = LLT::vector(2, 8);
433   const LLT V2S16 = LLT::vector(2, 16);
434   const LLT V4S16 = LLT::vector(4, 16);
435 
436   const LLT V2S32 = LLT::vector(2, 32);
437   const LLT V3S32 = LLT::vector(3, 32);
438   const LLT V4S32 = LLT::vector(4, 32);
439   const LLT V5S32 = LLT::vector(5, 32);
440   const LLT V6S32 = LLT::vector(6, 32);
441   const LLT V7S32 = LLT::vector(7, 32);
442   const LLT V8S32 = LLT::vector(8, 32);
443   const LLT V9S32 = LLT::vector(9, 32);
444   const LLT V10S32 = LLT::vector(10, 32);
445   const LLT V11S32 = LLT::vector(11, 32);
446   const LLT V12S32 = LLT::vector(12, 32);
447   const LLT V13S32 = LLT::vector(13, 32);
448   const LLT V14S32 = LLT::vector(14, 32);
449   const LLT V15S32 = LLT::vector(15, 32);
450   const LLT V16S32 = LLT::vector(16, 32);
451   const LLT V32S32 = LLT::vector(32, 32);
452 
453   const LLT V2S64 = LLT::vector(2, 64);
454   const LLT V3S64 = LLT::vector(3, 64);
455   const LLT V4S64 = LLT::vector(4, 64);
456   const LLT V5S64 = LLT::vector(5, 64);
457   const LLT V6S64 = LLT::vector(6, 64);
458   const LLT V7S64 = LLT::vector(7, 64);
459   const LLT V8S64 = LLT::vector(8, 64);
460   const LLT V16S64 = LLT::vector(16, 64);
461 
462   std::initializer_list<LLT> AllS32Vectors =
463     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465   std::initializer_list<LLT> AllS64Vectors =
466     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467 
468   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475 
476   const LLT CodePtr = FlatPtr;
477 
478   const std::initializer_list<LLT> AddrSpaces64 = {
479     GlobalPtr, ConstantPtr, FlatPtr
480   };
481 
482   const std::initializer_list<LLT> AddrSpaces32 = {
483     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484   };
485 
486   const std::initializer_list<LLT> FPTypesBase = {
487     S32, S64
488   };
489 
490   const std::initializer_list<LLT> FPTypes16 = {
491     S32, S64, S16
492   };
493 
494   const std::initializer_list<LLT> FPTypesPK16 = {
495     S32, S64, S16, V2S16
496   };
497 
498   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499 
500   setAction({G_BRCOND, S1}, Legal); // VCC branches
501   setAction({G_BRCOND, S32}, Legal); // SCC branches
502 
503   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504   // elements for v3s16
505   getActionDefinitionsBuilder(G_PHI)
506     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507     .legalFor(AllS32Vectors)
508     .legalFor(AllS64Vectors)
509     .legalFor(AddrSpaces64)
510     .legalFor(AddrSpaces32)
511     .legalIf(isPointer(0))
512     .clampScalar(0, S16, S256)
513     .widenScalarToNextPow2(0, 32)
514     .clampMaxNumElements(0, S32, 16)
515     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516     .scalarize(0);
517 
518   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519     // Full set of gfx9 features.
520     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521       .legalFor({S32, S16, V2S16})
522       .clampScalar(0, S16, S32)
523       .clampMaxNumElements(0, S16, 2)
524       .scalarize(0)
525       .widenScalarToNextPow2(0, 32);
526 
527     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528       .legalFor({S32, S16, V2S16}) // Clamp modifier
529       .minScalarOrElt(0, S16)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32)
533       .lower();
534   } else if (ST.has16BitInsts()) {
535     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536       .legalFor({S32, S16})
537       .clampScalar(0, S16, S32)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540 
    // Technically the saturating operations require clamp bit support, but
    // clamp support was introduced at the same time as the 16-bit operations.
543     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544       .legalFor({S32, S16}) // Clamp modifier
545       .minScalar(0, S16)
546       .scalarize(0)
547       .widenScalarToNextPow2(0, 16)
548       .lower();
549 
550     // We're just lowering this, but it helps get a better result to try to
551     // coerce to the desired type first.
552     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553       .minScalar(0, S16)
554       .scalarize(0)
555       .lower();
556   } else {
557     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558       .legalFor({S32})
559       .clampScalar(0, S32, S32)
560       .scalarize(0);
561 
562     if (ST.hasIntClamp()) {
563       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564         .legalFor({S32}) // Clamp modifier.
565         .scalarize(0)
566         .minScalarOrElt(0, S32)
567         .lower();
568     } else {
569       // Clamp bit support was added in VI, along with 16-bit operations.
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .minScalar(0, S32)
572         .scalarize(0)
573         .lower();
574     }
575 
576     // FIXME: DAG expansion gets better results. The widening uses the smaller
577     // range values and goes for the min/max lowering directly.
578     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579       .minScalar(0, S32)
580       .scalarize(0)
581       .lower();
582   }
583 
584   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585     .customFor({S32, S64})
586     .clampScalar(0, S32, S64)
587     .widenScalarToNextPow2(0, 32)
588     .scalarize(0);
589 
590   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591                    .legalFor({S32})
592                    .maxScalarOrElt(0, S32);
593 
594   if (ST.hasVOP3PInsts()) {
595     Mulh
596       .clampMaxNumElements(0, S8, 2)
597       .lowerFor({V2S8});
598   }
599 
600   Mulh
601     .scalarize(0)
602     .lower();
603 
604   // Report legal for any types we can handle anywhere. For the cases only legal
605   // on the SALU, RegBankSelect will be able to re-legalize.
606   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608     .clampScalar(0, S32, S64)
609     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611     .widenScalarToNextPow2(0)
612     .scalarize(0);
613 
614   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616     .legalFor({{S32, S1}, {S32, S32}})
617     .minScalar(0, S32)
618     // TODO: .scalarize(0)
619     .lower();
620 
621   getActionDefinitionsBuilder(G_BITCAST)
622     // Don't worry about the size constraint.
623     .legalIf(all(isRegisterType(0), isRegisterType(1)))
624     .lower();
625 
627   getActionDefinitionsBuilder(G_CONSTANT)
628     .legalFor({S1, S32, S64, S16, GlobalPtr,
629                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630     .legalIf(isPointer(0))
631     .clampScalar(0, S32, S64)
632     .widenScalarToNextPow2(0);
633 
634   getActionDefinitionsBuilder(G_FCONSTANT)
635     .legalFor({S32, S64, S16})
636     .clampScalar(0, S16, S64);
637 
638   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639       .legalIf(isRegisterType(0))
640       // s1 and s16 are special cases because they have legal operations on
641       // them, but don't really occupy registers in the normal way.
642       .legalFor({S1, S16})
643       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644       .clampScalarOrElt(0, S32, MaxScalar)
645       .widenScalarToNextPow2(0, 32)
646       .clampMaxNumElements(0, S32, 16);
647 
648   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649 
650   // If the amount is divergent, we have to do a wave reduction to get the
651   // maximum value, so this is expanded during RegBankSelect.
652   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653     .legalFor({{PrivatePtr, S32}});
654 
655   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656     .customIf(typeIsNot(0, PrivatePtr));
657 
658   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659 
660   auto &FPOpActions = getActionDefinitionsBuilder(
661     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662     .legalFor({S32, S64});
663   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664     .customFor({S32, S64});
665   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666     .customFor({S32, S64});
667 
668   if (ST.has16BitInsts()) {
669     if (ST.hasVOP3PInsts())
670       FPOpActions.legalFor({S16, V2S16});
671     else
672       FPOpActions.legalFor({S16});
673 
674     TrigActions.customFor({S16});
675     FDIVActions.customFor({S16});
676   }
677 
678   auto &MinNumMaxNum = getActionDefinitionsBuilder({
679       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680 
681   if (ST.hasVOP3PInsts()) {
682     MinNumMaxNum.customFor(FPTypesPK16)
683       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684       .clampMaxNumElements(0, S16, 2)
685       .clampScalar(0, S16, S64)
686       .scalarize(0);
687   } else if (ST.has16BitInsts()) {
688     MinNumMaxNum.customFor(FPTypes16)
689       .clampScalar(0, S16, S64)
690       .scalarize(0);
691   } else {
692     MinNumMaxNum.customFor(FPTypesBase)
693       .clampScalar(0, S32, S64)
694       .scalarize(0);
695   }
696 
697   if (ST.hasVOP3PInsts())
698     FPOpActions.clampMaxNumElements(0, S16, 2);
699 
700   FPOpActions
701     .scalarize(0)
702     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703 
704   TrigActions
705     .scalarize(0)
706     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707 
708   FDIVActions
709     .scalarize(0)
710     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711 
712   getActionDefinitionsBuilder({G_FNEG, G_FABS})
713     .legalFor(FPTypesPK16)
714     .clampMaxNumElements(0, S16, 2)
715     .scalarize(0)
716     .clampScalar(0, S16, S64);
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720       .legalFor({S32, S64, S16})
721       .scalarize(0)
722       .clampScalar(0, S16, S64);
723   } else {
724     getActionDefinitionsBuilder(G_FSQRT)
725       .legalFor({S32, S64})
726       .scalarize(0)
727       .clampScalar(0, S32, S64);
728 
729     if (ST.hasFractBug()) {
730       getActionDefinitionsBuilder(G_FFLOOR)
731         .customFor({S64})
732         .legalFor({S32, S64})
733         .scalarize(0)
734         .clampScalar(0, S32, S64);
735     } else {
736       getActionDefinitionsBuilder(G_FFLOOR)
737         .legalFor({S32, S64})
738         .scalarize(0)
739         .clampScalar(0, S32, S64);
740     }
741   }
742 
743   getActionDefinitionsBuilder(G_FPTRUNC)
744     .legalFor({{S32, S64}, {S16, S32}})
745     .scalarize(0)
746     .lower();
747 
748   getActionDefinitionsBuilder(G_FPEXT)
749     .legalFor({{S64, S32}, {S32, S16}})
750     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751     .scalarize(0);
752 
753   getActionDefinitionsBuilder(G_FSUB)
754       // Use actual fsub instruction
755       .legalFor({S32})
756       // Must use fadd + fneg
757       .lowerFor({S64, S16, V2S16})
758       .scalarize(0)
759       .clampScalar(0, S32, S64);
760 
761   // Whether this is legal depends on the floating point mode for the function.
762   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764     FMad.customFor({S32, S16});
765   else if (ST.hasMadMacF32Insts())
766     FMad.customFor({S32});
767   else if (ST.hasMadF16())
768     FMad.customFor({S16});
769   FMad.scalarize(0)
770       .lower();
771 
772   auto &FRem = getActionDefinitionsBuilder(G_FREM);
773   if (ST.has16BitInsts()) {
774     FRem.customFor({S16, S32, S64});
775   } else {
776     FRem.minScalar(0, S32)
777         .customFor({S32, S64});
778   }
779   FRem.scalarize(0);
780 
781   // TODO: Do we need to clamp maximum bitwidth?
782   getActionDefinitionsBuilder(G_TRUNC)
783     .legalIf(isScalar(0))
784     .legalFor({{V2S16, V2S32}})
785     .clampMaxNumElements(0, S16, 2)
786     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787     // situations (like an invalid implicit use), we don't want to infinite loop
788     // in the legalizer.
789     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790     .alwaysLegal();
791 
792   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794                {S32, S1}, {S64, S1}, {S16, S1}})
795     .scalarize(0)
796     .clampScalar(0, S32, S64)
797     .widenScalarToNextPow2(1, 32);
798 
799   // TODO: Split s1->s64 during regbankselect for VALU.
800   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802     .lowerFor({{S32, S64}})
803     .lowerIf(typeIs(1, S1))
804     .customFor({{S64, S64}});
805   if (ST.has16BitInsts())
806     IToFP.legalFor({{S16, S16}});
807   IToFP.clampScalar(1, S32, S64)
808        .minScalar(0, S32)
809        .scalarize(0)
810        .widenScalarToNextPow2(1);
811 
812   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814     .customFor({{S64, S64}})
815     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816   if (ST.has16BitInsts())
817     FPToI.legalFor({{S16, S16}});
818   else
819     FPToI.minScalar(1, S32);
820 
821   FPToI.minScalar(0, S32)
822        .scalarize(0)
823        .lower();
824 
825   // Lower roundeven into G_FRINT
826   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
827     .scalarize(0)
828     .lower();
829 
830   if (ST.has16BitInsts()) {
831     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
832       .legalFor({S16, S32, S64})
833       .clampScalar(0, S16, S64)
834       .scalarize(0);
835   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
836     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
837       .legalFor({S32, S64})
838       .clampScalar(0, S32, S64)
839       .scalarize(0);
840   } else {
841     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842       .legalFor({S32})
843       .customFor({S64})
844       .clampScalar(0, S32, S64)
845       .scalarize(0);
846   }
847 
848   getActionDefinitionsBuilder(G_PTR_ADD)
849     .legalIf(all(isPointer(0), sameSize(0, 1)))
850     .scalarize(0)
851     .scalarSameSizeAs(1, 0);
852 
853   getActionDefinitionsBuilder(G_PTRMASK)
854     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
855     .scalarSameSizeAs(1, 0)
856     .scalarize(0);
857 
858   auto &CmpBuilder =
859     getActionDefinitionsBuilder(G_ICMP)
860     // The compare output type differs based on the register bank of the output,
861     // so make both s1 and s32 legal.
862     //
863     // Scalar compares producing output in scc will be promoted to s32, as that
864     // is the allocatable register type that will be needed for the copy from
865     // scc. This will be promoted during RegBankSelect, and we assume something
866     // before that won't try to use s32 result types.
867     //
868     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
869     // bank.
870     .legalForCartesianProduct(
871       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
872     .legalForCartesianProduct(
873       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
874   if (ST.has16BitInsts()) {
875     CmpBuilder.legalFor({{S1, S16}});
876   }
877 
878   CmpBuilder
879     .widenScalarToNextPow2(1)
880     .clampScalar(1, S32, S64)
881     .scalarize(0)
882     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
883 
884   getActionDefinitionsBuilder(G_FCMP)
885     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
886     .widenScalarToNextPow2(1)
887     .clampScalar(1, S32, S64)
888     .scalarize(0);
889 
890   // FIXME: fpow has a selection pattern that should move to custom lowering.
891   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
892   if (ST.has16BitInsts())
893     Exp2Ops.legalFor({S32, S16});
894   else
895     Exp2Ops.legalFor({S32});
896   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
897   Exp2Ops.scalarize(0);
898 
899   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
900   if (ST.has16BitInsts())
901     ExpOps.customFor({{S32}, {S16}});
902   else
903     ExpOps.customFor({S32});
904   ExpOps.clampScalar(0, MinScalarFPTy, S32)
905         .scalarize(0);
906 
907   getActionDefinitionsBuilder(G_FPOWI)
908     .clampScalar(0, MinScalarFPTy, S32)
909     .lower();
910 
911   // The 64-bit versions produce 32-bit results, but only on the SALU.
912   getActionDefinitionsBuilder(G_CTPOP)
913     .legalFor({{S32, S32}, {S32, S64}})
914     .clampScalar(0, S32, S32)
915     .clampScalar(1, S32, S64)
916     .scalarize(0)
917     .widenScalarToNextPow2(0, 32)
918     .widenScalarToNextPow2(1, 32);
919 
920   // The hardware instructions return a different result on 0 than the generic
921   // instructions expect. The hardware produces -1, but these produce the
922   // bitwidth.
923   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
924     .scalarize(0)
925     .clampScalar(0, S32, S32)
926     .clampScalar(1, S32, S64)
927     .widenScalarToNextPow2(0, 32)
928     .widenScalarToNextPow2(1, 32)
929     .lower();
930 
931   // The 64-bit versions produce 32-bit results, but only on the SALU.
932   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
933     .legalFor({{S32, S32}, {S32, S64}})
934     .clampScalar(0, S32, S32)
935     .clampScalar(1, S32, S64)
936     .scalarize(0)
937     .widenScalarToNextPow2(0, 32)
938     .widenScalarToNextPow2(1, 32);
939 
940   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
941   // RegBankSelect.
942   getActionDefinitionsBuilder(G_BITREVERSE)
943     .legalFor({S32, S64})
944     .clampScalar(0, S32, S64)
945     .scalarize(0)
946     .widenScalarToNextPow2(0);
947 
948   if (ST.has16BitInsts()) {
949     getActionDefinitionsBuilder(G_BSWAP)
950       .legalFor({S16, S32, V2S16})
951       .clampMaxNumElements(0, S16, 2)
952       // FIXME: Fixing non-power-of-2 before clamp is workaround for
953       // narrowScalar limitation.
954       .widenScalarToNextPow2(0)
955       .clampScalar(0, S16, S32)
956       .scalarize(0);
957 
958     if (ST.hasVOP3PInsts()) {
959       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
960         .legalFor({S32, S16, V2S16})
961         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
962         .clampMaxNumElements(0, S16, 2)
963         .minScalar(0, S16)
964         .widenScalarToNextPow2(0)
965         .scalarize(0)
966         .lower();
967     } else {
968       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
969         .legalFor({S32, S16})
970         .widenScalarToNextPow2(0)
971         .minScalar(0, S16)
972         .scalarize(0)
973         .lower();
974     }
975   } else {
976     // TODO: Should have same legality without v_perm_b32
977     getActionDefinitionsBuilder(G_BSWAP)
978       .legalFor({S32})
979       .lowerIf(scalarNarrowerThan(0, 32))
980       // FIXME: Fixing non-power-of-2 before clamp is workaround for
981       // narrowScalar limitation.
982       .widenScalarToNextPow2(0)
983       .maxScalar(0, S32)
984       .scalarize(0)
985       .lower();
986 
987     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
988       .legalFor({S32})
989       .minScalar(0, S32)
990       .widenScalarToNextPow2(0)
991       .scalarize(0)
992       .lower();
993   }
994 
995   getActionDefinitionsBuilder(G_INTTOPTR)
996     // List the common cases
997     .legalForCartesianProduct(AddrSpaces64, {S64})
998     .legalForCartesianProduct(AddrSpaces32, {S32})
999     .scalarize(0)
1000     // Accept any address space as long as the size matches
1001     .legalIf(sameSize(0, 1))
1002     .widenScalarIf(smallerThan(1, 0),
1003       [](const LegalityQuery &Query) {
1004         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1005       })
1006     .narrowScalarIf(largerThan(1, 0),
1007       [](const LegalityQuery &Query) {
1008         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1009       });
1010 
1011   getActionDefinitionsBuilder(G_PTRTOINT)
1012     // List the common cases
1013     .legalForCartesianProduct(AddrSpaces64, {S64})
1014     .legalForCartesianProduct(AddrSpaces32, {S32})
1015     .scalarize(0)
1016     // Accept any address space as long as the size matches
1017     .legalIf(sameSize(0, 1))
1018     .widenScalarIf(smallerThan(0, 1),
1019       [](const LegalityQuery &Query) {
1020         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021       })
1022     .narrowScalarIf(
1023       largerThan(0, 1),
1024       [](const LegalityQuery &Query) {
1025         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1026       });
1027 
1028   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1029     .scalarize(0)
1030     .custom();
1031 
1032   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1033                                     bool IsLoad) -> bool {
1034     const LLT DstTy = Query.Types[0];
1035 
1036     // Split vector extloads.
1037     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1038     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1039 
1040     if (MemSize < DstTy.getSizeInBits())
1041       MemSize = std::max(MemSize, AlignBits);
1042 
1043     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1044       return true;
1045 
1046     const LLT PtrTy = Query.Types[1];
1047     unsigned AS = PtrTy.getAddressSpace();
1048     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1049       return true;
1050 
1051     // Catch weird sized loads that don't evenly divide into the access sizes
1052     // TODO: May be able to widen depending on alignment etc.
1053     unsigned NumRegs = (MemSize + 31) / 32;
1054     if (NumRegs == 3) {
1055       if (!ST.hasDwordx3LoadStores())
1056         return true;
1057     } else {
1058       // If the alignment allows, these should have been widened.
1059       if (!isPowerOf2_32(NumRegs))
1060         return true;
1061     }
1062 
1063     if (AlignBits < MemSize) {
1064       const SITargetLowering *TLI = ST.getTargetLowering();
1065       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1066                                                       Align(AlignBits / 8));
1067     }
1068 
1069     return false;
1070   };
1071 
1072   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1073   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1074   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
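  // These are the minimum alignments (in bits) for the memory descriptors
  // below; 0 appears to mean no extra alignment requirement when unaligned
  // buffer access is enabled.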
1075 
1076   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1077   // LDS
1078   // TODO: Unsupported flat for SI.
1079 
1080   for (unsigned Op : {G_LOAD, G_STORE}) {
1081     const bool IsStore = Op == G_STORE;
1082 
1083     auto &Actions = getActionDefinitionsBuilder(Op);
1084     // Explicitly list some common cases.
1085     // TODO: Does this help compile time at all?
1086     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1087                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1088                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1089                                       {S64, GlobalPtr, 64, GlobalAlign32},
1090                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1091                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1092                                       {S32, GlobalPtr, 8, GlobalAlign8},
1093                                       {S32, GlobalPtr, 16, GlobalAlign16},
1094 
1095                                       {S32, LocalPtr, 32, 32},
1096                                       {S64, LocalPtr, 64, 32},
1097                                       {V2S32, LocalPtr, 64, 32},
1098                                       {S32, LocalPtr, 8, 8},
1099                                       {S32, LocalPtr, 16, 16},
1100                                       {V2S16, LocalPtr, 32, 32},
1101 
1102                                       {S32, PrivatePtr, 32, 32},
1103                                       {S32, PrivatePtr, 8, 8},
1104                                       {S32, PrivatePtr, 16, 16},
1105                                       {V2S16, PrivatePtr, 32, 32},
1106 
1107                                       {S32, ConstantPtr, 32, GlobalAlign32},
1108                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1109                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1110                                       {S64, ConstantPtr, 64, GlobalAlign32},
1111                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1112     Actions.legalIf(
1113       [=](const LegalityQuery &Query) -> bool {
1114         return isLoadStoreLegal(ST, Query, Op);
1115       });
1116 
1117     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1118     // 64-bits.
1119     //
1120     // TODO: Should generalize bitcast action into coerce, which will also cover
1121     // inserting addrspacecasts.
1122     Actions.customIf(typeIs(1, Constant32Ptr));
1123 
1124     // Turn any illegal element vectors into something easier to deal
1125     // with. These will ultimately produce 32-bit scalar shifts to extract the
1126     // parts anyway.
1127     //
1128     // For odd 16-bit element vectors, prefer to split those into pieces with
1129     // 16-bit vector parts.
1130     Actions.bitcastIf(
1131       [=](const LegalityQuery &Query) -> bool {
1132         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1133                                           Query.MMODescrs[0].SizeInBits);
1134       }, bitcastToRegisterType(0));
1135 
1136     if (!IsStore) {
1137       // Widen suitably aligned loads by loading extra bytes. The standard
1138       // legalization actions can't properly express widening memory operands.
1139       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1140         return shouldWidenLoad(ST, Query, G_LOAD);
1141       });
1142     }
1143 
1144     // FIXME: load/store narrowing should be moved to lower action
1145     Actions
1146         .narrowScalarIf(
1147             [=](const LegalityQuery &Query) -> bool {
1148               return !Query.Types[0].isVector() &&
1149                      needToSplitMemOp(Query, Op == G_LOAD);
1150             },
1151             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1152               const LLT DstTy = Query.Types[0];
1153               const LLT PtrTy = Query.Types[1];
1154 
1155               const unsigned DstSize = DstTy.getSizeInBits();
1156               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1157 
1158               // Split extloads.
1159               if (DstSize > MemSize)
1160                 return std::make_pair(0, LLT::scalar(MemSize));
1161 
1162               if (!isPowerOf2_32(DstSize)) {
1163                 // We're probably decomposing an odd sized store. Try to split
1164                 // to the widest type. TODO: Account for alignment. As-is it
1165                 // should be OK, since the new parts will be further legalized.
1166                 unsigned FloorSize = PowerOf2Floor(DstSize);
1167                 return std::make_pair(0, LLT::scalar(FloorSize));
1168               }
1169 
1170               if (DstSize > 32 && (DstSize % 32 != 0)) {
1171                 // FIXME: Need a way to specify non-extload of larger size if
1172                 // suitably aligned.
1173                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1174               }
1175 
1176               unsigned MaxSize = maxSizeForAddrSpace(ST,
1177                                                      PtrTy.getAddressSpace(),
1178                                                      Op == G_LOAD);
1179               if (MemSize > MaxSize)
1180                 return std::make_pair(0, LLT::scalar(MaxSize));
1181 
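              // Otherwise narrow to the alignment so each piece is naturally
              // aligned; the pieces may be legalized further.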
1182               unsigned Align = Query.MMODescrs[0].AlignInBits;
1183               return std::make_pair(0, LLT::scalar(Align));
1184             })
1185         .fewerElementsIf(
1186             [=](const LegalityQuery &Query) -> bool {
1187               return Query.Types[0].isVector() &&
1188                      needToSplitMemOp(Query, Op == G_LOAD);
1189             },
1190             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1191               const LLT DstTy = Query.Types[0];
1192               const LLT PtrTy = Query.Types[1];
1193 
1194               LLT EltTy = DstTy.getElementType();
1195               unsigned MaxSize = maxSizeForAddrSpace(ST,
1196                                                      PtrTy.getAddressSpace(),
1197                                                      Op == G_LOAD);
1198 
1199               // FIXME: Handle widened to power of 2 results better. This ends
1200               // up scalarizing.
1201               // FIXME: 3 element stores scalarized on SI
1202 
1203               // Split if it's too large for the address space.
1204               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1205                 unsigned NumElts = DstTy.getNumElements();
1206                 unsigned EltSize = EltTy.getSizeInBits();
1207 
1208                 if (MaxSize % EltSize == 0) {
1209                   return std::make_pair(
1210                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1211                 }
1212 
1213                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1214 
1215                 // FIXME: Refine when odd breakdowns handled
1216                 // The scalars will need to be re-legalized.
1217                 if (NumPieces == 1 || NumPieces >= NumElts ||
1218                     NumElts % NumPieces != 0)
1219                   return std::make_pair(0, EltTy);
1220 
1221                 return std::make_pair(0,
1222                                       LLT::vector(NumElts / NumPieces, EltTy));
1223               }
1224 
1225               // FIXME: We could probably handle weird extending loads better.
1226               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1227               if (DstTy.getSizeInBits() > MemSize)
1228                 return std::make_pair(0, EltTy);
1229 
1230               unsigned EltSize = EltTy.getSizeInBits();
1231               unsigned DstSize = DstTy.getSizeInBits();
1232               if (!isPowerOf2_32(DstSize)) {
1233                 // We're probably decomposing an odd sized store. Try to split
1234                 // to the widest type. TODO: Account for alignment. As-is it
1235                 // should be OK, since the new parts will be further legalized.
1236                 unsigned FloorSize = PowerOf2Floor(DstSize);
1237                 return std::make_pair(
1238                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1239               }
1240 
1241               // Need to split because of alignment.
1242               unsigned Align = Query.MMODescrs[0].AlignInBits;
1243               if (EltSize > Align &&
1244                   (EltSize / Align < DstTy.getNumElements())) {
1245                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1246               }
1247 
1248               // May need relegalization for the scalars.
1249               return std::make_pair(0, EltTy);
1250             })
1251     .lowerIfMemSizeNotPow2()
1252     .minScalar(0, S32);
1253 
1254     if (IsStore)
1255       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1256 
1257     Actions
1258         .widenScalarToNextPow2(0)
1259         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1260         .lower();
1261   }
1262 
1263   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1264                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1265                                                   {S32, GlobalPtr, 16, 2 * 8},
1266                                                   {S32, LocalPtr, 8, 8},
1267                                                   {S32, LocalPtr, 16, 16},
1268                                                   {S32, PrivatePtr, 8, 8},
1269                                                   {S32, PrivatePtr, 16, 16},
1270                                                   {S32, ConstantPtr, 8, 8},
1271                                                   {S32, ConstantPtr, 16, 2 * 8}});
1272   if (ST.hasFlatAddressSpace()) {
1273     ExtLoads.legalForTypesWithMemDesc(
1274         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1275   }
1276 
1277   ExtLoads.clampScalar(0, S32, S32)
1278           .widenScalarToNextPow2(0)
1279           .unsupportedIfMemSizeNotPow2()
1280           .lower();
1281 
1282   auto &Atomics = getActionDefinitionsBuilder(
1283     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1284      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1285      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1286      G_ATOMICRMW_UMIN})
1287     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1288                {S64, GlobalPtr}, {S64, LocalPtr},
1289                {S32, RegionPtr}, {S64, RegionPtr}});
1290   if (ST.hasFlatAddressSpace()) {
1291     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1292   }
1293 
1294   if (ST.hasLDSFPAtomics()) {
1295     auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1296       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1297     if (ST.hasGFX90AInsts())
1298       Atomic.legalFor({{S64, LocalPtr}});
1299   }
1300 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1303   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1304     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1305                 {S32, FlatPtr}, {S64, FlatPtr}})
1306     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1307                {S32, RegionPtr}, {S64, RegionPtr}});
1308   // TODO: Pointer types, any 32-bit or 64-bit vector
1309 
1310   // Condition should be s32 for scalar, s1 for vector.
1311   getActionDefinitionsBuilder(G_SELECT)
1312     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1313           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1314           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1315     .clampScalar(0, S16, S64)
1316     .scalarize(1)
1317     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1318     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1319     .clampMaxNumElements(0, S32, 2)
1320     .clampMaxNumElements(0, LocalPtr, 2)
1321     .clampMaxNumElements(0, PrivatePtr, 2)
1322     .scalarize(0)
1323     .widenScalarToNextPow2(0)
1324     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1325 
1326   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1327   // be more flexible with the shift amount type.
1328   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1329     .legalFor({{S32, S32}, {S64, S32}});
1330   if (ST.has16BitInsts()) {
1331     if (ST.hasVOP3PInsts()) {
1332       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1333             .clampMaxNumElements(0, S16, 2);
1334     } else
1335       Shifts.legalFor({{S16, S16}});
1336 
1337     // TODO: Support 16-bit shift amounts for all types
1338     Shifts.widenScalarIf(
1339       [=](const LegalityQuery &Query) {
1340         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1341         // 32-bit amount.
1342         const LLT ValTy = Query.Types[0];
1343         const LLT AmountTy = Query.Types[1];
1344         return ValTy.getSizeInBits() <= 16 &&
1345                AmountTy.getSizeInBits() < 16;
1346       }, changeTo(1, S16));
1347     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1348     Shifts.clampScalar(1, S32, S32);
1349     Shifts.clampScalar(0, S16, S64);
1350     Shifts.widenScalarToNextPow2(0, 16);
1351 
1352     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1353       .minScalar(0, S16)
1354       .scalarize(0)
1355       .lower();
1356   } else {
1357     // Make sure we legalize the shift amount type first, as the general
1358     // expansion for the shifted type will produce much worse code if it hasn't
1359     // been truncated already.
1360     Shifts.clampScalar(1, S32, S32);
1361     Shifts.clampScalar(0, S32, S64);
1362     Shifts.widenScalarToNextPow2(0, 32);
1363 
1364     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1365       .minScalar(0, S32)
1366       .scalarize(0)
1367       .lower();
1368   }
1369   Shifts.scalarize(0);
1370 
1371   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1372     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1373     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1374     unsigned IdxTypeIdx = 2;
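    // For G_EXTRACT_VECTOR_ELT the element is the result and the vector is
    // operand 1; for G_INSERT_VECTOR_ELT the result itself is the vector.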
1375 
1376     getActionDefinitionsBuilder(Op)
1377       .customIf([=](const LegalityQuery &Query) {
1378           const LLT EltTy = Query.Types[EltTypeIdx];
1379           const LLT VecTy = Query.Types[VecTypeIdx];
1380           const LLT IdxTy = Query.Types[IdxTypeIdx];
1381           const unsigned EltSize = EltTy.getSizeInBits();
1382           return (EltSize == 32 || EltSize == 64) &&
1383                   VecTy.getSizeInBits() % 32 == 0 &&
1384                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1385                   IdxTy.getSizeInBits() == 32;
1386         })
1387       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1388                  bitcastToVectorElement32(VecTypeIdx))
1389       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1390       .bitcastIf(
1391         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1392         [=](const LegalityQuery &Query) {
1393           // For > 64-bit element types, try to turn this into a 64-bit
1394           // element vector since we may be able to do better indexing
1395           // if this is scalar. If not, fall back to 32.
1396           const LLT EltTy = Query.Types[EltTypeIdx];
1397           const LLT VecTy = Query.Types[VecTypeIdx];
1398           const unsigned DstEltSize = EltTy.getSizeInBits();
1399           const unsigned VecSize = VecTy.getSizeInBits();
1400 
1401           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1402           return std::make_pair(
1403             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1404         })
1405       .clampScalar(EltTypeIdx, S32, S64)
1406       .clampScalar(VecTypeIdx, S32, S64)
1407       .clampScalar(IdxTypeIdx, S32, S32)
1408       .clampMaxNumElements(VecTypeIdx, S32, 32)
1409       // TODO: Clamp elements for 64-bit vectors?
1410       // It should only be necessary with variable indexes.
1411       // As a last resort, lower to the stack
1412       .lower();
1413   }
1414 
1415   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416     .unsupportedIf([=](const LegalityQuery &Query) {
1417         const LLT &EltTy = Query.Types[1].getElementType();
1418         return Query.Types[0] != EltTy;
1419       });
1420 
1421   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
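    // G_EXTRACT reads a LitTy-sized piece out of a BigTy source; G_INSERT
    // writes a LitTy-sized piece into a BigTy result.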
1424 
1425     // FIXME: Doesn't handle extract of illegal sizes.
1426     getActionDefinitionsBuilder(Op)
1427       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428       // FIXME: Multiples of 16 should not be legal.
1429       .legalIf([=](const LegalityQuery &Query) {
1430           const LLT BigTy = Query.Types[BigTyIdx];
1431           const LLT LitTy = Query.Types[LitTyIdx];
1432           return (BigTy.getSizeInBits() % 32 == 0) &&
1433                  (LitTy.getSizeInBits() % 16 == 0);
1434         })
1435       .widenScalarIf(
1436         [=](const LegalityQuery &Query) {
1437           const LLT BigTy = Query.Types[BigTyIdx];
1438           return (BigTy.getScalarSizeInBits() < 16);
1439         },
1440         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1441       .widenScalarIf(
1442         [=](const LegalityQuery &Query) {
1443           const LLT LitTy = Query.Types[LitTyIdx];
1444           return (LitTy.getScalarSizeInBits() < 16);
1445         },
1446         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1447       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1448       .widenScalarToNextPow2(BigTyIdx, 32);
1449 
1450   }
1451 
1452   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1453     .legalForCartesianProduct(AllS32Vectors, {S32})
1454     .legalForCartesianProduct(AllS64Vectors, {S64})
1455     .clampNumElements(0, V16S32, V32S32)
1456     .clampNumElements(0, V2S64, V16S64)
1457     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1458 
1459   if (ST.hasScalarPackInsts()) {
1460     BuildVector
1461       // FIXME: Should probably widen s1 vectors straight to s32
1462       .minScalarOrElt(0, S16)
1463       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1464       .minScalar(1, S32);
1465 
1466     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1467       .legalFor({V2S16, S32})
1468       .lower();
1469     BuildVector.minScalarOrElt(0, S32);
1470   } else {
1471     BuildVector.customFor({V2S16, S16});
1472     BuildVector.minScalarOrElt(0, S32);
1473 
1474     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1475       .customFor({V2S16, S32})
1476       .lower();
1477   }
1478 
1479   BuildVector.legalIf(isRegisterType(0));
1480 
1481   // FIXME: Clamp maximum size
1482   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1483     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1484     .clampMaxNumElements(0, S32, 32)
1485     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1486     .clampMaxNumElements(0, S16, 64);
1487 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1490   if (ST.hasVOP3PInsts()) {
1491     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1492       .customFor({V2S16, V2S16})
1493       .lower();
1494   } else
1495     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1496 
1497   // Merge/Unmerge
1498   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1499     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1500     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1501 
1502     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1503       const LLT Ty = Query.Types[TypeIdx];
1504       if (Ty.isVector()) {
1505         const LLT &EltTy = Ty.getElementType();
1506         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1507           return true;
1508         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1509           return true;
1510       }
1511       return false;
1512     };
1513 
1514     auto &Builder = getActionDefinitionsBuilder(Op)
1515       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1516       .lowerFor({{S16, V2S16}})
1517       .lowerIf([=](const LegalityQuery &Query) {
1518           const LLT BigTy = Query.Types[BigTyIdx];
1519           return BigTy.getSizeInBits() == 32;
1520         })
1521       // Try to widen to s16 first for small types.
1522       // TODO: Only do this on targets with legal s16 shifts
1523       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1524       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1525       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1526       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1527                            elementTypeIs(1, S16)),
1528                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1532       .clampScalar(LitTyIdx, S32, S512)
1533       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1534       // Break up vectors with weird elements into scalars
1535       .fewerElementsIf(
1536         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1537         scalarize(0))
1538       .fewerElementsIf(
1539         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1540         scalarize(1))
1541       .clampScalar(BigTyIdx, S32, MaxScalar);
1542 
1543     if (Op == G_MERGE_VALUES) {
1544       Builder.widenScalarIf(
1545         // TODO: Use 16-bit shifts if legal for 8-bit values?
1546         [=](const LegalityQuery &Query) {
1547           const LLT Ty = Query.Types[LitTyIdx];
1548           return Ty.getSizeInBits() < 32;
1549         },
1550         changeTo(LitTyIdx, S32));
1551     }
1552 
1553     Builder.widenScalarIf(
1554       [=](const LegalityQuery &Query) {
1555         const LLT Ty = Query.Types[BigTyIdx];
1556         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1557           Ty.getSizeInBits() % 16 != 0;
1558       },
1559       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
1562         const LLT &Ty = Query.Types[BigTyIdx];
1563         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1564         if (NewSizeInBits >= 256) {
1565           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1566           if (RoundedTo < NewSizeInBits)
1567             NewSizeInBits = RoundedTo;
1568         }
1569         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1570       })
1571       // Any vectors left are the wrong size. Scalarize them.
1572       .scalarize(0)
1573       .scalarize(1);
1574   }
1575 
1576   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1577   // RegBankSelect.
1578   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1579     .legalFor({{S32}, {S64}});
1580 
1581   if (ST.hasVOP3PInsts()) {
1582     SextInReg.lowerFor({{V2S16}})
1583       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1584       // get more vector shift opportunities, since we'll get those when
1585       // expanded.
1586       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1587   } else if (ST.has16BitInsts()) {
1588     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1589   } else {
1590     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1592     SextInReg.lowerFor({{S32}, {S64}});
1593   }
1594 
1595   SextInReg
1596     .scalarize(0)
1597     .clampScalar(0, S32, S64)
1598     .lower();
1599 
  // TODO: Only try to form v2s16 with legal packed instructions.
1601   getActionDefinitionsBuilder(G_FSHR)
1602     .legalFor({{S32, S32}})
1603     .lowerFor({{V2S16, V2S16}})
1604     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1605     .scalarize(0)
1606     .lower();
1607 
1608   if (ST.hasVOP3PInsts()) {
1609     getActionDefinitionsBuilder(G_FSHL)
1610       .lowerFor({{V2S16, V2S16}})
1611       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1612       .scalarize(0)
1613       .lower();
1614   } else {
1615     getActionDefinitionsBuilder(G_FSHL)
1616       .scalarize(0)
1617       .lower();
1618   }
1619 
1620   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1621     .legalFor({S64});
1622 
1623   getActionDefinitionsBuilder(G_FENCE)
1624     .alwaysLegal();
1625 
1626   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1627       .scalarize(0)
1628       .minScalar(0, S32)
1629       .lower();
1630 
1631   getActionDefinitionsBuilder({
1632       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1633       G_FCOPYSIGN,
1634 
1635       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1636       G_ATOMICRMW_NAND,
1637       G_ATOMICRMW_FSUB,
1638       G_READ_REGISTER,
1639       G_WRITE_REGISTER,
1640 
1641       G_SADDO, G_SSUBO,
1642 
      // TODO: Implement
1644       G_FMINIMUM, G_FMAXIMUM}).lower();
1645 
1646   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1647         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1648         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1649     .unsupported();
1650 
1651   computeTables();
1652   verify(*ST.getInstrInfo());
1653 }
1654 
1655 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1656                                          MachineInstr &MI) const {
1657   MachineIRBuilder &B = Helper.MIRBuilder;
1658   MachineRegisterInfo &MRI = *B.getMRI();
1659 
1660   switch (MI.getOpcode()) {
1661   case TargetOpcode::G_ADDRSPACE_CAST:
1662     return legalizeAddrSpaceCast(MI, MRI, B);
1663   case TargetOpcode::G_FRINT:
1664     return legalizeFrint(MI, MRI, B);
1665   case TargetOpcode::G_FCEIL:
1666     return legalizeFceil(MI, MRI, B);
1667   case TargetOpcode::G_FREM:
1668     return legalizeFrem(MI, MRI, B);
1669   case TargetOpcode::G_INTRINSIC_TRUNC:
1670     return legalizeIntrinsicTrunc(MI, MRI, B);
1671   case TargetOpcode::G_SITOFP:
1672     return legalizeITOFP(MI, MRI, B, true);
1673   case TargetOpcode::G_UITOFP:
1674     return legalizeITOFP(MI, MRI, B, false);
1675   case TargetOpcode::G_FPTOSI:
1676     return legalizeFPTOI(MI, MRI, B, true);
1677   case TargetOpcode::G_FPTOUI:
1678     return legalizeFPTOI(MI, MRI, B, false);
1679   case TargetOpcode::G_FMINNUM:
1680   case TargetOpcode::G_FMAXNUM:
1681   case TargetOpcode::G_FMINNUM_IEEE:
1682   case TargetOpcode::G_FMAXNUM_IEEE:
1683     return legalizeMinNumMaxNum(Helper, MI);
1684   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1685     return legalizeExtractVectorElt(MI, MRI, B);
1686   case TargetOpcode::G_INSERT_VECTOR_ELT:
1687     return legalizeInsertVectorElt(MI, MRI, B);
1688   case TargetOpcode::G_SHUFFLE_VECTOR:
1689     return legalizeShuffleVector(MI, MRI, B);
1690   case TargetOpcode::G_FSIN:
1691   case TargetOpcode::G_FCOS:
1692     return legalizeSinCos(MI, MRI, B);
1693   case TargetOpcode::G_GLOBAL_VALUE:
1694     return legalizeGlobalValue(MI, MRI, B);
1695   case TargetOpcode::G_LOAD:
1696     return legalizeLoad(Helper, MI);
1697   case TargetOpcode::G_FMAD:
1698     return legalizeFMad(MI, MRI, B);
1699   case TargetOpcode::G_FDIV:
1700     return legalizeFDIV(MI, MRI, B);
1701   case TargetOpcode::G_UDIV:
1702   case TargetOpcode::G_UREM:
1703     return legalizeUDIV_UREM(MI, MRI, B);
1704   case TargetOpcode::G_SDIV:
1705   case TargetOpcode::G_SREM:
1706     return legalizeSDIV_SREM(MI, MRI, B);
1707   case TargetOpcode::G_ATOMIC_CMPXCHG:
1708     return legalizeAtomicCmpXChg(MI, MRI, B);
1709   case TargetOpcode::G_FLOG:
1710     return legalizeFlog(MI, B, numbers::ln2f);
1711   case TargetOpcode::G_FLOG10:
1712     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1713   case TargetOpcode::G_FEXP:
1714     return legalizeFExp(MI, B);
1715   case TargetOpcode::G_FPOW:
1716     return legalizeFPow(MI, B);
1717   case TargetOpcode::G_FFLOOR:
1718     return legalizeFFloor(MI, MRI, B);
1719   case TargetOpcode::G_BUILD_VECTOR:
1720     return legalizeBuildVector(MI, MRI, B);
1721   default:
1722     return false;
1723   }
1724 
1725   llvm_unreachable("expected switch to return");
1726 }
1727 
1728 Register AMDGPULegalizerInfo::getSegmentAperture(
1729   unsigned AS,
1730   MachineRegisterInfo &MRI,
1731   MachineIRBuilder &B) const {
1732   MachineFunction &MF = B.getMF();
1733   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1734   const LLT S32 = LLT::scalar(32);
1735 
1736   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1737 
1738   if (ST.hasApertureRegs()) {
1739     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1740     // getreg.
1741     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1742         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1743         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1744     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1745         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1746         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1747     unsigned Encoding =
1748         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1749         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1750         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1751 
1752     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1753 
1754     B.buildInstr(AMDGPU::S_GETREG_B32)
1755       .addDef(GetReg)
1756       .addImm(Encoding);
1757     MRI.setType(GetReg, S32);
1758 
1759     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1760     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1761   }
1762 
1763   Register QueuePtr = MRI.createGenericVirtualRegister(
1764     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1765 
1766   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1767     return Register();
1768 
1769   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1770   // private_segment_aperture_base_hi.
1771   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1772 
1773   // TODO: can we be smarter about machine pointer info?
1774   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1775   MachineMemOperand *MMO = MF.getMachineMemOperand(
1776       PtrInfo,
1777       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1778           MachineMemOperand::MOInvariant,
1779       4, commonAlignment(Align(64), StructOffset));
1780 
1781   Register LoadAddr;
1782 
1783   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1784   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1785 }
1786 
1787 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1788   MachineInstr &MI, MachineRegisterInfo &MRI,
1789   MachineIRBuilder &B) const {
1790   MachineFunction &MF = B.getMF();
1791 
1792   const LLT S32 = LLT::scalar(32);
1793   Register Dst = MI.getOperand(0).getReg();
1794   Register Src = MI.getOperand(1).getReg();
1795 
1796   LLT DstTy = MRI.getType(Dst);
1797   LLT SrcTy = MRI.getType(Src);
1798   unsigned DestAS = DstTy.getAddressSpace();
1799   unsigned SrcAS = SrcTy.getAddressSpace();
1800 
1801   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1802   // vector element.
1803   assert(!DstTy.isVector());
1804 
1805   const AMDGPUTargetMachine &TM
1806     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1807 
1808   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1809     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1810     return true;
1811   }
1812 
1813   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1814     // Truncate.
1815     B.buildExtract(Dst, Src, 0);
1816     MI.eraseFromParent();
1817     return true;
1818   }
1819 
1820   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1821     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1822     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1823 
1824     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1825     // another. Merge operands are required to be the same type, but creating an
1826     // extra ptrtoint would be kind of pointless.
1827     auto HighAddr = B.buildConstant(
1828       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1829     B.buildMerge(Dst, {Src, HighAddr});
1830     MI.eraseFromParent();
1831     return true;
1832   }
1833 
1834   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1835     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1836            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1837     unsigned NullVal = TM.getNullPointerValue(DestAS);
1838 
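    // The result is the low 32 bits of the flat pointer, with the flat null
    // value mapped to the segment null value:
    //   cast(x) = (x == flat-null) ? segment-null : lo32(x)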
1839     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1840     auto FlatNull = B.buildConstant(SrcTy, 0);
1841 
1842     // Extract low 32-bits of the pointer.
1843     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1844 
1845     auto CmpRes =
1846         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1847     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1848 
1849     MI.eraseFromParent();
1850     return true;
1851   }
1852 
1853   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1854     return false;
1855 
1856   if (!ST.hasFlatAddressSpace())
1857     return false;
1858 
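  // Build the flat pointer by pairing the 32-bit segment offset with the
  // aperture base as the high half, and map the segment null value to the
  // flat null value:
  //   cast(x) = (x == segment-null) ? flat-null : {x, aperture-hi}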
1859   auto SegmentNull =
1860       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1861   auto FlatNull =
1862       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1863 
1864   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1865   if (!ApertureReg.isValid())
1866     return false;
1867 
1868   auto CmpRes =
1869       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1870 
1871   // Coerce the type of the low half of the result so we can use merge_values.
1872   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1873 
1874   // TODO: Should we allow mismatched types but matching sizes in merges to
1875   // avoid the ptrtoint?
1876   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1877   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1878 
1879   MI.eraseFromParent();
1880   return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizeFrint(
1884   MachineInstr &MI, MachineRegisterInfo &MRI,
1885   MachineIRBuilder &B) const {
1886   Register Src = MI.getOperand(1).getReg();
1887   LLT Ty = MRI.getType(Src);
1888   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1889 
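  // Round by adding and then subtracting copysign(2^52, x): at that magnitude
  // an f64 has no fractional mantissa bits, so the add/sub rounds to an
  // integer. Values with |x| > 2^52 - 0.5 (C2) are already integral and are
  // passed through by the final select.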
1890   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1891   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1892 
1893   auto C1 = B.buildFConstant(Ty, C1Val);
1894   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1895 
1896   // TODO: Should this propagate fast-math-flags?
1897   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1898   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1899 
1900   auto C2 = B.buildFConstant(Ty, C2Val);
1901   auto Fabs = B.buildFAbs(Ty, Src);
1902 
1903   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1904   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1905   MI.eraseFromParent();
1906   return true;
1907 }
1908 
1909 bool AMDGPULegalizerInfo::legalizeFceil(
1910   MachineInstr &MI, MachineRegisterInfo &MRI,
1911   MachineIRBuilder &B) const {
1912 
1913   const LLT S1 = LLT::scalar(1);
1914   const LLT S64 = LLT::scalar(64);
1915 
1916   Register Src = MI.getOperand(1).getReg();
1917   assert(MRI.getType(Src) == S64);
1918 
1919   // result = trunc(src)
1920   // if (src > 0.0 && src != result)
1921   //   result += 1.0
1922 
1923   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1924 
1925   const auto Zero = B.buildFConstant(S64, 0.0);
1926   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1930   auto Add = B.buildSelect(S64, And, One, Zero);
1931 
1932   // TODO: Should this propagate fast-math-flags?
1933   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1935 }
1936 
1937 bool AMDGPULegalizerInfo::legalizeFrem(
1938   MachineInstr &MI, MachineRegisterInfo &MRI,
1939   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);
1945 
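  // frem(x, y) is expanded as x - trunc(x / y) * y, with the last step folded
  // into fma(-trunc(x / y), y, x).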
  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1952 }
1953 
1954 static MachineInstrBuilder extractF64Exponent(Register Hi,
1955                                               MachineIRBuilder &B) {
1956   const unsigned FractBits = 52;
1957   const unsigned ExpBits = 11;
1958   LLT S32 = LLT::scalar(32);
1959 
1960   auto Const0 = B.buildConstant(S32, FractBits - 32);
1961   auto Const1 = B.buildConstant(S32, ExpBits);
1962 
1963   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1964     .addUse(Hi)
1965     .addUse(Const0.getReg(0))
1966     .addUse(Const1.getReg(0));
1967 
1968   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1969 }
1970 
1971 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1972   MachineInstr &MI, MachineRegisterInfo &MRI,
1973   MachineIRBuilder &B) const {
1974   const LLT S1 = LLT::scalar(1);
1975   const LLT S32 = LLT::scalar(32);
1976   const LLT S64 = LLT::scalar(64);
1977 
1978   Register Src = MI.getOperand(1).getReg();
1979   assert(MRI.getType(Src) == S64);
1980 
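  // Clear the fractional mantissa bits based on the unbiased exponent:
  //   exp < 0  -> the result is just the sign bit (+/-0.0)
  //   0..51    -> mask off the low (52 - exp) fraction bits of the source
  //   exp > 51 -> the value is already an integer; return it unchanged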
1981   // TODO: Should this use extract since the low half is unused?
1982   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1983   Register Hi = Unmerge.getReg(1);
1984 
1985   // Extract the upper half, since this is where we will find the sign and
1986   // exponent.
1987   auto Exp = extractF64Exponent(Hi, B);
1988 
1989   const unsigned FractBits = 52;
1990 
1991   // Extract the sign bit.
1992   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1993   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1994 
1995   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1996 
1997   const auto Zero32 = B.buildConstant(S32, 0);
1998 
1999   // Extend back to 64-bits.
2000   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2001 
2002   auto Shr = B.buildAShr(S64, FractMask, Exp);
2003   auto Not = B.buildNot(S64, Shr);
2004   auto Tmp0 = B.buildAnd(S64, Src, Not);
2005   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2006 
2007   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2008   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2009 
2010   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2011   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2012   MI.eraseFromParent();
2013   return true;
2014 }
2015 
2016 bool AMDGPULegalizerInfo::legalizeITOFP(
2017   MachineInstr &MI, MachineRegisterInfo &MRI,
2018   MachineIRBuilder &B, bool Signed) const {
2019 
2020   Register Dst = MI.getOperand(0).getReg();
2021   Register Src = MI.getOperand(1).getReg();
2022 
2023   const LLT S64 = LLT::scalar(64);
2024   const LLT S32 = LLT::scalar(32);
2025 
2026   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2027 
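  // With x split into 32-bit halves, x = hi * 2^32 + lo (lo unsigned), so the
  // result is ldexp(itofp(hi), 32) + uitofp(lo).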
2028   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2029 
2030   auto CvtHi = Signed ?
2031     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2032     B.buildUITOFP(S64, Unmerge.getReg(1));
2033 
2034   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2035 
2036   auto ThirtyTwo = B.buildConstant(S32, 32);
2037   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2038     .addUse(CvtHi.getReg(0))
2039     .addUse(ThirtyTwo.getReg(0));
2040 
2041   // TODO: Should this propagate fast-math-flags?
2042   B.buildFAdd(Dst, LdExp, CvtLo);
2043   MI.eraseFromParent();
2044   return true;
2045 }
2046 
2047 // TODO: Copied from DAG implementation. Verify logic and document how this
2048 // actually works.
2049 bool AMDGPULegalizerInfo::legalizeFPTOI(
2050   MachineInstr &MI, MachineRegisterInfo &MRI,
2051   MachineIRBuilder &B, bool Signed) const {
2052 
2053   Register Dst = MI.getOperand(0).getReg();
2054   Register Src = MI.getOperand(1).getReg();
2055 
2056   const LLT S64 = LLT::scalar(64);
2057   const LLT S32 = LLT::scalar(32);
2058 
2059   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2060 
2061   unsigned Flags = MI.getFlags();
2062 
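  // Split the truncated value into 32-bit halves:
  //   hi = floor(trunc(x) * 2^-32)                            (K0 == 2^-32)
  //   lo = trunc(x) - hi * 2^32, via fma(hi, K1, trunc(x)),   (K1 == -(2^32))
  // hi becomes the signed/unsigned high word and lo the unsigned low word.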
2063   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2064   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2065   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2066 
2067   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2068   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2069   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2070 
2071   auto Hi = Signed ?
2072     B.buildFPTOSI(S32, FloorMul) :
2073     B.buildFPTOUI(S32, FloorMul);
2074   auto Lo = B.buildFPTOUI(S32, Fma);
2075 
2076   B.buildMerge(Dst, { Lo, Hi });
2077   MI.eraseFromParent();
2078 
2079   return true;
2080 }
2081 
2082 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2083                                                MachineInstr &MI) const {
2084   MachineFunction &MF = Helper.MIRBuilder.getMF();
2085   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2086 
2087   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2088                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2089 
2090   // With ieee_mode disabled, the instructions have the correct behavior
2091   // already for G_FMINNUM/G_FMAXNUM
2092   if (!MFI->getMode().IEEE)
2093     return !IsIEEEOp;
2094 
2095   if (IsIEEEOp)
2096     return true;
2097 
2098   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2099 }
2100 
2101 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2102   MachineInstr &MI, MachineRegisterInfo &MRI,
2103   MachineIRBuilder &B) const {
2104   // TODO: Should move some of this into LegalizerHelper.
2105 
2106   // TODO: Promote dynamic indexing of s16 to s32
2107 
2108   // FIXME: Artifact combiner probably should have replaced the truncated
2109   // constant before this, so we shouldn't need
2110   // getConstantVRegValWithLookThrough.
2111   Optional<ValueAndVReg> MaybeIdxVal =
2112       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2113   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2114     return true;
2115   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2116 
2117   Register Dst = MI.getOperand(0).getReg();
2118   Register Vec = MI.getOperand(1).getReg();
2119 
2120   LLT VecTy = MRI.getType(Vec);
2121   LLT EltTy = VecTy.getElementType();
2122   assert(EltTy == MRI.getType(Dst));
2123 
2124   if (IdxVal < VecTy.getNumElements())
2125     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2126   else
2127     B.buildUndef(Dst);
2128 
2129   MI.eraseFromParent();
2130   return true;
2131 }
2132 
2133 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2134   MachineInstr &MI, MachineRegisterInfo &MRI,
2135   MachineIRBuilder &B) const {
2136   // TODO: Should move some of this into LegalizerHelper.
2137 
2138   // TODO: Promote dynamic indexing of s16 to s32
2139 
2140   // FIXME: Artifact combiner probably should have replaced the truncated
2141   // constant before this, so we shouldn't need
2142   // getConstantVRegValWithLookThrough.
2143   Optional<ValueAndVReg> MaybeIdxVal =
2144       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2145   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2146     return true;
2147 
2148   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2149   Register Dst = MI.getOperand(0).getReg();
2150   Register Vec = MI.getOperand(1).getReg();
2151   Register Ins = MI.getOperand(2).getReg();
2152 
2153   LLT VecTy = MRI.getType(Vec);
2154   LLT EltTy = VecTy.getElementType();
2155   assert(EltTy == MRI.getType(Ins));
2156 
2157   if (IdxVal < VecTy.getNumElements())
2158     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2159   else
2160     B.buildUndef(Dst);
2161 
2162   MI.eraseFromParent();
2163   return true;
2164 }
2165 
2166 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2167   MachineInstr &MI, MachineRegisterInfo &MRI,
2168   MachineIRBuilder &B) const {
2169   const LLT V2S16 = LLT::vector(2, 16);
2170 
2171   Register Dst = MI.getOperand(0).getReg();
2172   Register Src0 = MI.getOperand(1).getReg();
2173   LLT DstTy = MRI.getType(Dst);
2174   LLT SrcTy = MRI.getType(Src0);
2175 
2176   if (SrcTy == V2S16 && DstTy == V2S16 &&
2177       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2178     return true;
2179 
2180   MachineIRBuilder HelperBuilder(MI);
2181   GISelObserverWrapper DummyObserver;
2182   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2183   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2184 }
2185 
2186 bool AMDGPULegalizerInfo::legalizeSinCos(
2187   MachineInstr &MI, MachineRegisterInfo &MRI,
2188   MachineIRBuilder &B) const {
2189 
2190   Register DstReg = MI.getOperand(0).getReg();
2191   Register SrcReg = MI.getOperand(1).getReg();
2192   LLT Ty = MRI.getType(DstReg);
2193   unsigned Flags = MI.getFlags();
2194 
2195   Register TrigVal;
2196   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2197   if (ST.hasTrigReducedRange()) {
2198     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2199     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2200       .addUse(MulVal.getReg(0))
2201       .setMIFlags(Flags).getReg(0);
2202   } else
2203     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2204 
2205   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2206     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2207   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2208     .addUse(TrigVal)
2209     .setMIFlags(Flags);
2210   MI.eraseFromParent();
2211   return true;
2212 }
2213 
2214 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2215                                                   MachineIRBuilder &B,
2216                                                   const GlobalValue *GV,
2217                                                   int64_t Offset,
2218                                                   unsigned GAFlags) const {
2219   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2220   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2221   // to the following code sequence:
2222   //
2223   // For constant address space:
2224   //   s_getpc_b64 s[0:1]
2225   //   s_add_u32 s0, s0, $symbol
2226   //   s_addc_u32 s1, s1, 0
2227   //
2228   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2229   //   a fixup or relocation is emitted to replace $symbol with a literal
2230   //   constant, which is a pc-relative offset from the encoding of the $symbol
2231   //   operand to the global variable.
2232   //
2233   // For global address space:
2234   //   s_getpc_b64 s[0:1]
2235   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2236   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2237   //
2238   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2239   //   fixups or relocations are emitted to replace $symbol@*@lo and
2240   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2241   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2242   //   operand to the global variable.
2243   //
2244   // What we want here is an offset from the value returned by s_getpc
2245   // (which is the address of the s_add_u32 instruction) to the global
2246   // variable, but since the encoding of $symbol starts 4 bytes after the start
2247   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2248   // small. This requires us to add 4 to the global variable offset in order to
2249   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2250   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2251   // instruction.
2252 
2253   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2254 
2255   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2256     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2257 
2258   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2259     .addDef(PCReg);
2260 
2261   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2262   if (GAFlags == SIInstrInfo::MO_NONE)
2263     MIB.addImm(0);
2264   else
2265     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2266 
2267   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2268 
2269   if (PtrTy.getSizeInBits() == 32)
2270     B.buildExtract(DstReg, PCReg, 0);
2271   return true;
}
2273 
2274 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2275   MachineInstr &MI, MachineRegisterInfo &MRI,
2276   MachineIRBuilder &B) const {
2277   Register DstReg = MI.getOperand(0).getReg();
2278   LLT Ty = MRI.getType(DstReg);
2279   unsigned AS = Ty.getAddressSpace();
2280 
2281   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2282   MachineFunction &MF = B.getMF();
2283   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2284 
2285   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2286     if (!MFI->isModuleEntryFunction()) {
2287       const Function &Fn = MF.getFunction();
2288       DiagnosticInfoUnsupported BadLDSDecl(
2289         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2290         DS_Warning);
2291       Fn.getContext().diagnose(BadLDSDecl);
2292 
2293       // We currently don't have a way to correctly allocate LDS objects that
2294       // aren't directly associated with a kernel. We do force inlining of
2295       // functions that use local objects. However, if these dead functions are
2296       // not eliminated, we don't want a compile time error. Just emit a warning
2297       // and a trap, since there should be no callable path here.
2298       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2299       B.buildUndef(DstReg);
2300       MI.eraseFromParent();
2301       return true;
2302     }
2303 
2304     // TODO: We could emit code to handle the initialization somewhere.
2305     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2306       const SITargetLowering *TLI = ST.getTargetLowering();
2307       if (!TLI->shouldUseLDSConstAddress(GV)) {
2308         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2309         return true; // Leave in place;
2310       }
2311 
2312       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2313         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. It is allocated by the
        // runtime and placed directly after the statically allocated ones.
        // All such variables share the same offset.
2319         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2320           // Adjust alignment for that dynamic shared memory array.
2321           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2322           LLT S32 = LLT::scalar(32);
2323           auto Sz =
2324               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2325           B.buildIntToPtr(DstReg, Sz);
2326           MI.eraseFromParent();
2327           return true;
2328         }
2329       }
2330 
2331       B.buildConstant(
2332           DstReg,
2333           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2334       MI.eraseFromParent();
2335       return true;
2336     }
2337 
2338     const Function &Fn = MF.getFunction();
2339     DiagnosticInfoUnsupported BadInit(
2340       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2341     Fn.getContext().diagnose(BadInit);
2342     return true;
2343   }
2344 
2345   const SITargetLowering *TLI = ST.getTargetLowering();
2346 
2347   if (TLI->shouldEmitFixup(GV)) {
2348     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2349     MI.eraseFromParent();
2350     return true;
2351   }
2352 
2353   if (TLI->shouldEmitPCReloc(GV)) {
2354     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2355     MI.eraseFromParent();
2356     return true;
2357   }
2358 
2359   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2360   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2361 
2362   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2363       MachinePointerInfo::getGOT(MF),
2364       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2365           MachineMemOperand::MOInvariant,
2366       8 /*Size*/, Align(8));
2367 
2368   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2369 
2370   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2372     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2373     B.buildExtract(DstReg, Load, 0);
2374   } else
2375     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2376 
2377   MI.eraseFromParent();
2378   return true;
2379 }
2380 
2381 static LLT widenToNextPowerOf2(LLT Ty) {
2382   if (Ty.isVector())
2383     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2384   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2385 }
2386 
2387 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2388                                        MachineInstr &MI) const {
2389   MachineIRBuilder &B = Helper.MIRBuilder;
2390   MachineRegisterInfo &MRI = *B.getMRI();
2391   GISelChangeObserver &Observer = Helper.Observer;
2392 
2393   Register PtrReg = MI.getOperand(1).getReg();
2394   LLT PtrTy = MRI.getType(PtrReg);
2395   unsigned AddrSpace = PtrTy.getAddressSpace();
2396 
2397   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2398     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2399     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2400     Observer.changingInstr(MI);
2401     MI.getOperand(1).setReg(Cast.getReg(0));
2402     Observer.changedInstr(MI);
2403     return true;
2404   }
2405 
2406   Register ValReg = MI.getOperand(0).getReg();
2407   LLT ValTy = MRI.getType(ValReg);
2408 
2409   MachineMemOperand *MMO = *MI.memoperands_begin();
2410   const unsigned ValSize = ValTy.getSizeInBits();
2411   const unsigned MemSize = 8 * MMO->getSize();
2412   const Align MemAlign = MMO->getAlign();
2413   const unsigned AlignInBits = 8 * MemAlign.value();
2414 
2415   // Widen non-power-of-2 loads to the alignment if needed
2416   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2417     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2418 
2419     // This was already the correct extending load result type, so just adjust
2420     // the memory type.
2421     if (WideMemSize == ValSize) {
2422       MachineFunction &MF = B.getMF();
2423 
2424       MachineMemOperand *WideMMO =
2425           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2426       Observer.changingInstr(MI);
2427       MI.setMemRefs(MF, {WideMMO});
2428       Observer.changedInstr(MI);
2429       return true;
2430     }
2431 
    // Don't bother handling an edge case that should probably never be
    // produced.
2433     if (ValSize > WideMemSize)
2434       return false;
2435 
2436     LLT WideTy = widenToNextPowerOf2(ValTy);
2437 
2438     Register WideLoad;
2439     if (!WideTy.isVector()) {
2440       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2441       B.buildTrunc(ValReg, WideLoad).getReg(0);
2442     } else {
2443       // Extract the subvector.
2444 
2445       if (isRegisterType(ValTy)) {
2446         // If this a case where G_EXTRACT is legal, use it.
2447         // (e.g. <3 x s32> -> <4 x s32>)
2448         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2449         B.buildExtract(ValReg, WideLoad, 0);
2450       } else {
2451         // For cases where the widened type isn't a nice register value, unmerge
2452         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2453         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2454         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2455         B.setInsertPt(B.getMBB(), MI.getIterator());
2456         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2457       }
2458     }
2459 
2460     MI.eraseFromParent();
2461     return true;
2462   }
2463 
2464   return false;
2465 }
2466 
2467 bool AMDGPULegalizerInfo::legalizeFMad(
2468   MachineInstr &MI, MachineRegisterInfo &MRI,
2469   MachineIRBuilder &B) const {
2470   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2471   assert(Ty.isScalar());
2472 
2473   MachineFunction &MF = B.getMF();
2474   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2475 
2476   // TODO: Always legal with future ftz flag.
  // FIXME: Do we only need to check the output denormal mode?
2478   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2479     return true;
2480   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2481     return true;
2482 
2483   MachineIRBuilder HelperBuilder(MI);
2484   GISelObserverWrapper DummyObserver;
2485   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2486   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2487 }
2488 
2489 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2490   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2491   Register DstReg = MI.getOperand(0).getReg();
2492   Register PtrReg = MI.getOperand(1).getReg();
2493   Register CmpVal = MI.getOperand(2).getReg();
2494   Register NewVal = MI.getOperand(3).getReg();
2495 
2496   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2497          "this should not have been custom lowered");
2498 
2499   LLT ValTy = MRI.getType(CmpVal);
2500   LLT VecTy = LLT::vector(2, ValTy);
2501 
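  // The target cmpxchg pseudo takes the new value and the compare value
  // packed together as a single vector operand.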
2502   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2503 
2504   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2505     .addDef(DstReg)
2506     .addUse(PtrReg)
2507     .addUse(PackedVal)
2508     .setMemRefs(MI.memoperands());
2509 
2510   MI.eraseFromParent();
2511   return true;
2512 }
2513 
2514 bool AMDGPULegalizerInfo::legalizeFlog(
2515   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2516   Register Dst = MI.getOperand(0).getReg();
2517   Register Src = MI.getOperand(1).getReg();
2518   LLT Ty = B.getMRI()->getType(Dst);
2519   unsigned Flags = MI.getFlags();
2520 
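  // log_b(x) = log2(x) * (1 / log2(b)); Log2BaseInverted is the caller's
  // precomputed 1 / log2(b).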
2521   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2522   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2523 
2524   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2525   MI.eraseFromParent();
2526   return true;
2527 }
2528 
2529 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2530                                        MachineIRBuilder &B) const {
2531   Register Dst = MI.getOperand(0).getReg();
2532   Register Src = MI.getOperand(1).getReg();
2533   unsigned Flags = MI.getFlags();
2534   LLT Ty = B.getMRI()->getType(Dst);
2535 
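  // exp(x) = exp2(x * log2(e))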
2536   auto K = B.buildFConstant(Ty, numbers::log2e);
2537   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2538   B.buildFExp2(Dst, Mul, Flags);
2539   MI.eraseFromParent();
2540   return true;
2541 }
2542 
2543 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2544                                        MachineIRBuilder &B) const {
2545   Register Dst = MI.getOperand(0).getReg();
2546   Register Src0 = MI.getOperand(1).getReg();
2547   Register Src1 = MI.getOperand(2).getReg();
2548   unsigned Flags = MI.getFlags();
2549   LLT Ty = B.getMRI()->getType(Dst);
2550   const LLT S16 = LLT::scalar(16);
2551   const LLT S32 = LLT::scalar(32);
2552 
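  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses the legacy
  // (DX9-rules) multiply, where 0 * anything == 0, presumably so that a zero
  // exponent still yields exp2(0) == 1 even when log2(x) is inf or nan.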
2553   if (Ty == S32) {
2554     auto Log = B.buildFLog2(S32, Src0, Flags);
2555     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2556       .addUse(Log.getReg(0))
2557       .addUse(Src1)
2558       .setMIFlags(Flags);
2559     B.buildFExp2(Dst, Mul, Flags);
2560   } else if (Ty == S16) {
2561     // There's no f16 fmul_legacy, so we need to convert for it.
2562     auto Log = B.buildFLog2(S16, Src0, Flags);
2563     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2564     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2565     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2566       .addUse(Ext0.getReg(0))
2567       .addUse(Ext1.getReg(0))
2568       .setMIFlags(Flags);
2569 
2570     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2571   } else
2572     return false;
2573 
2574   MI.eraseFromParent();
2575   return true;
2576 }
2577 
2578 // Find a source register, ignoring any possible source modifiers.
2579 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2580   Register ModSrc = OrigSrc;
2581   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2582     ModSrc = SrcFNeg->getOperand(1).getReg();
2583     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2584       ModSrc = SrcFAbs->getOperand(1).getReg();
2585   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2586     ModSrc = SrcFAbs->getOperand(1).getReg();
2587   return ModSrc;
2588 }
2589 
2590 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2591                                          MachineRegisterInfo &MRI,
2592                                          MachineIRBuilder &B) const {
2593 
2594   const LLT S1 = LLT::scalar(1);
2595   const LLT S64 = LLT::scalar(64);
2596   Register Dst = MI.getOperand(0).getReg();
2597   Register OrigSrc = MI.getOperand(1).getReg();
2598   unsigned Flags = MI.getFlags();
2599   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2600          "this should not have been custom lowered");
2601 
2602   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2603   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2604   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2605   // V_FRACT bug is:
2606   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2607   //
2608   // Convert floor(x) to (x - fract(x))
2609 
2610   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2611     .addUse(OrigSrc)
2612     .setMIFlags(Flags);
2613 
2614   // Give source modifier matching some assistance before obscuring a foldable
2615   // pattern.
2616 
  // TODO: Can we avoid the neg on the fract? The input sign to fract
  // shouldn't matter.
2619   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2620 
2621   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2622 
2623   Register Min = MRI.createGenericVirtualRegister(S64);
2624 
2625   // We don't need to concern ourselves with the snan handling difference, so
2626   // use the one which will directly select.
2627   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2628   if (MFI->getMode().IEEE)
2629     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2630   else
2631     B.buildFMinNum(Min, Fract, Const, Flags);
2632 
2633   Register CorrectedFract = Min;
2634   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2635     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2636     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2637   }
2638 
2639   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2640   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2641 
2642   MI.eraseFromParent();
2643   return true;
2644 }
2645 
2646 // Turn an illegal packed v2s16 build vector into bit operations.
2647 // TODO: This should probably be a bitcast action in LegalizerHelper.
2648 bool AMDGPULegalizerInfo::legalizeBuildVector(
2649   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2650   Register Dst = MI.getOperand(0).getReg();
2651   const LLT S32 = LLT::scalar(32);
2652   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2653 
2654   Register Src0 = MI.getOperand(1).getReg();
2655   Register Src1 = MI.getOperand(2).getReg();
2656   assert(MRI.getType(Src0) == LLT::scalar(16));
2657 
2658   auto Merge = B.buildMerge(S32, {Src0, Src1});
2659   B.buildBitcast(Dst, Merge);
2660 
2661   MI.eraseFromParent();
2662   return true;
2663 }
2664 
2665 // Check that this is a G_XOR x, -1
2666 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2667   if (MI.getOpcode() != TargetOpcode::G_XOR)
2668     return false;
2669   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2670   return ConstVal && *ConstVal == -1;
2671 }
2672 
2673 // Return the use branch instruction, otherwise null if the usage is invalid.
2674 static MachineInstr *
2675 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2676                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2677   Register CondDef = MI.getOperand(0).getReg();
2678   if (!MRI.hasOneNonDBGUse(CondDef))
2679     return nullptr;
2680 
2681   MachineBasicBlock *Parent = MI.getParent();
2682   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2683 
2684   if (isNot(MRI, *UseMI)) {
2685     Register NegatedCond = UseMI->getOperand(0).getReg();
2686     if (!MRI.hasOneNonDBGUse(NegatedCond))
2687       return nullptr;
2688 
2689     // We're deleting the def of this value, so we need to remove it.
2690     UseMI->eraseFromParent();
2691 
2692     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2693     Negated = true;
2694   }
2695 
2696   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2697     return nullptr;
2698 
2699   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2700   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2701   if (Next == Parent->end()) {
2702     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2703     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2704       return nullptr;
2705     UncondBrTarget = &*NextMBB;
2706   } else {
2707     if (Next->getOpcode() != AMDGPU::G_BR)
2708       return nullptr;
2709     Br = &*Next;
2710     UncondBrTarget = Br->getOperand(0).getMBB();
2711   }
2712 
2713   return UseMI;
2714 }
2715 
2716 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2717                                          const ArgDescriptor *Arg,
2718                                          const TargetRegisterClass *ArgRC,
2719                                          LLT ArgTy) const {
2720   MCRegister SrcReg = Arg->getRegister();
2721   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2722   assert(DstReg.isVirtual() && "Virtual register expected");
2723 
2724   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2725                                              ArgTy);
2726   if (Arg->isMasked()) {
2727     // TODO: Should we try to emit this once in the entry block?
2728     const LLT S32 = LLT::scalar(32);
2729     const unsigned Mask = Arg->getMask();
2730     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2731 
2732     Register AndMaskSrc = LiveIn;
2733 
2734     if (Shift != 0) {
2735       auto ShiftAmt = B.buildConstant(S32, Shift);
2736       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2737     }
2738 
2739     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2740   } else {
2741     B.buildCopy(DstReg, LiveIn);
2742   }
2743 
2744   return true;
2745 }
2746 
2747 bool AMDGPULegalizerInfo::loadInputValue(
2748     Register DstReg, MachineIRBuilder &B,
2749     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2750   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2751   const ArgDescriptor *Arg;
2752   const TargetRegisterClass *ArgRC;
2753   LLT ArgTy;
2754   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2755 
2756   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2757     return false; // TODO: Handle these
2758   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2759 }
2760 
2761 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2762     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2763     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2764   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2765     return false;
2766 
2767   MI.eraseFromParent();
2768   return true;
2769 }
2770 
2771 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2772                                        MachineRegisterInfo &MRI,
2773                                        MachineIRBuilder &B) const {
2774   Register Dst = MI.getOperand(0).getReg();
2775   LLT DstTy = MRI.getType(Dst);
2776   LLT S16 = LLT::scalar(16);
2777   LLT S32 = LLT::scalar(32);
2778   LLT S64 = LLT::scalar(64);
2779 
2780   if (DstTy == S16)
2781     return legalizeFDIV16(MI, MRI, B);
2782   if (DstTy == S32)
2783     return legalizeFDIV32(MI, MRI, B);
2784   if (DstTy == S64)
2785     return legalizeFDIV64(MI, MRI, B);
2786 
2787   return false;
2788 }
2789 
2790 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2791                                                   Register DstReg,
2792                                                   Register X,
2793                                                   Register Y,
2794                                                   bool IsDiv) const {
2795   const LLT S1 = LLT::scalar(1);
2796   const LLT S32 = LLT::scalar(32);
2797 
2798   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2799   // algorithm used here.
2800 
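  // In outline (all values are unsigned 32-bit):
  //   z  = estimate of 2^32 / y from the hardware float reciprocal
  //   z += umulh(z, -y * z)              ; one Newton-Raphson refinement
  //   q  = umulh(x, z); r = x - q * y    ; initial quotient / remainder
  // followed by two conditional corrections: if (r >= y) { ++q; r -= y; }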
2801   // Initial estimate of inv(y).
2802   auto FloatY = B.buildUITOFP(S32, Y);
2803   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2804   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2805   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2806   auto Z = B.buildFPTOUI(S32, ScaledY);
2807 
  // One round of UNR (unsigned Newton-Raphson) refinement.
2809   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2810   auto NegYZ = B.buildMul(S32, NegY, Z);
2811   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2812 
2813   // Quotient/remainder estimate.
2814   auto Q = B.buildUMulH(S32, X, Z);
2815   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2816 
2817   // First quotient/remainder refinement.
2818   auto One = B.buildConstant(S32, 1);
2819   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2820   if (IsDiv)
2821     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2822   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2823 
2824   // Second quotient/remainder refinement.
2825   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2826   if (IsDiv)
2827     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2828   else
2829     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2830 }
2831 
2832 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2833                                               MachineRegisterInfo &MRI,
2834                                               MachineIRBuilder &B) const {
2835   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2836   Register DstReg = MI.getOperand(0).getReg();
2837   Register Num = MI.getOperand(1).getReg();
2838   Register Den = MI.getOperand(2).getReg();
2839   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2840   MI.eraseFromParent();
2841   return true;
2842 }
2843 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2845 //
2846 // Return lo, hi of result
2847 //
2848 // %cvt.lo = G_UITOFP Val.lo
2849 // %cvt.hi = G_UITOFP Val.hi
2850 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2851 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2852 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2853 // %mul2 = G_FMUL %mul1, 2**(-32)
2854 // %trunc = G_INTRINSIC_TRUNC %mul2
2855 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2856 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2857 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2858                                                        Register Val) {
2859   const LLT S32 = LLT::scalar(32);
2860   auto Unmerge = B.buildUnmerge(S32, Val);
2861 
2862   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2863   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2864 
2865   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2866                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2867 
2868   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2869   auto Mul1 =
2870       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
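  // 0x5f7ffffc is just below 2**64 as a float, so Mul1 approximates
  // 2**64 / Val; the trunc/mad below split it into the high and low 32-bit
  // halves of that fixed-point reciprocal.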
2871 
2872   // 2**(-32)
2873   auto Mul2 =
2874       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2875   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2876 
2877   // -(2**32)
2878   auto Mad2 = B.buildFMAD(S32, Trunc,
2879                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2880 
2881   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2882   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2883 
2884   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2885 }
2886 
2887 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2888                                                   Register DstReg,
2889                                                   Register Numer,
2890                                                   Register Denom,
2891                                                   bool IsDiv) const {
2892   const LLT S32 = LLT::scalar(32);
2893   const LLT S64 = LLT::scalar(64);
2894   const LLT S1 = LLT::scalar(1);
2895   Register RcpLo, RcpHi;
2896 
2897   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2898 
2899   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2900 
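  // Rcp is approximately 2**64 / Denom in 64-bit fixed point. The two blocks
  // below each perform one Newton-Raphson refinement step, carried out in
  // 32-bit halves with explicit carry propagation.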
2901   auto Zero64 = B.buildConstant(S64, 0);
2902   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2903 
2904   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2905   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2906 
2907   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2908   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2909   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2910 
2911   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2912   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2913   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2914   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2915 
2916   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2917   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2918   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2919   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2920   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2921 
2922   auto Zero32 = B.buildConstant(S32, 0);
2923   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2924   auto Add2_HiC =
2925       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2926   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2927   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2928 
2929   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2930   Register NumerLo = UnmergeNumer.getReg(0);
2931   Register NumerHi = UnmergeNumer.getReg(1);
2932 
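  // Initial quotient estimate Q = umulh(Numer, Add2) and remainder
  // Numer - Q * Denom; the comparisons below apply up to two corrections.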
2933   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2934   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2935   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2936   Register Mul3_Lo = UnmergeMul3.getReg(0);
2937   Register Mul3_Hi = UnmergeMul3.getReg(1);
2938   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2939   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2940   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2941   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2942 
2943   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2944   Register DenomLo = UnmergeDenom.getReg(0);
2945   Register DenomHi = UnmergeDenom.getReg(1);
2946 
2947   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2948   auto C1 = B.buildSExt(S32, CmpHi);
2949 
2950   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2951   auto C2 = B.buildSExt(S32, CmpLo);
2952 
2953   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2954   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2955 
  // TODO: Here and below, portions of the code can be enclosed in if/endif
  // blocks. Currently the control flow is unconditional and we have 4 selects
  // after the potential endif to substitute for PHIs.
2959 
2960   // if C3 != 0 ...
2961   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2962   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2963   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2964   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2965 
2966   auto One64 = B.buildConstant(S64, 1);
2967   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2968 
2969   auto C4 =
2970       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2971   auto C5 =
2972       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2973   auto C6 = B.buildSelect(
2974       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2975 
2976   // if (C6 != 0)
2977   auto Add4 = B.buildAdd(S64, Add3, One64);
2978   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2979 
2980   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2981   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2982   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2983 
2984   // endif C6
2985   // endif C3
2986 
2987   if (IsDiv) {
2988     auto Sel1 = B.buildSelect(
2989         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2990     B.buildSelect(DstReg,
2991                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2992   } else {
2993     auto Sel2 = B.buildSelect(
2994         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2995     B.buildSelect(DstReg,
2996                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2997   }
2998 }
2999 
3000 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
3001                                             MachineRegisterInfo &MRI,
3002                                             MachineIRBuilder &B) const {
3003   const LLT S64 = LLT::scalar(64);
3004   const LLT S32 = LLT::scalar(32);
3005   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
3006   Register DstReg = MI.getOperand(0).getReg();
3007   Register Num = MI.getOperand(1).getReg();
3008   Register Den = MI.getOperand(2).getReg();
3009   LLT Ty = MRI.getType(DstReg);
3010 
3011   if (Ty == S32)
3012     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3013   else if (Ty == S64)
3014     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3015   else
3016     return false;
3017 
3018   MI.eraseFromParent();
3019   return true;
}
3022 
3023 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3024                                             MachineRegisterInfo &MRI,
3025                                             MachineIRBuilder &B) const {
3026   const LLT S64 = LLT::scalar(64);
3027   const LLT S32 = LLT::scalar(32);
3028 
3029   Register DstReg = MI.getOperand(0).getReg();
3030   const LLT Ty = MRI.getType(DstReg);
3031   if (Ty != S32 && Ty != S64)
3032     return false;
3033 
3034   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3035 
3036   Register LHS = MI.getOperand(1).getReg();
3037   Register RHS = MI.getOperand(2).getReg();
3038 
3039   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3040   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3041   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3042 
3043   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3044   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3045 
3046   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3047   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3048 
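  // (x + sign) ^ sign is |x| when 'sign' is the all-ones/zero sign mask, so
  // the unsigned expansion below operates on absolute values.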
3049   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3050   if (Ty == S32)
3051     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3052   else
3053     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3054 
3055   Register Sign;
3056   if (IsDiv)
3057     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3058   else
3059     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3060 
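  // Apply the result sign: (v ^ sign) - sign negates v when 'sign' is all
  // ones and is a no-op when it is zero.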
3061   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3062   B.buildSub(DstReg, UDivRem, Sign);
3063 
3064   MI.eraseFromParent();
3065   return true;
3066 }
3067 
3068 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3069                                                  MachineRegisterInfo &MRI,
3070                                                  MachineIRBuilder &B) const {
3071   Register Res = MI.getOperand(0).getReg();
3072   Register LHS = MI.getOperand(1).getReg();
3073   Register RHS = MI.getOperand(2).getReg();
3074   uint16_t Flags = MI.getFlags();
3075   LLT ResTy = MRI.getType(Res);
3076 
3077   const MachineFunction &MF = B.getMF();
3078   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3079                             MI.getFlag(MachineInstr::FmAfn);
3080 
3081   if (!AllowInaccurateRcp)
3082     return false;
3083 
3084   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3085     // 1 / x -> RCP(x)
3086     if (CLHS->isExactlyValue(1.0)) {
3087       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3088         .addUse(RHS)
3089         .setMIFlags(Flags);
3090 
3091       MI.eraseFromParent();
3092       return true;
3093     }
3094 
3095     // -1 / x -> RCP( FNEG(x) )
3096     if (CLHS->isExactlyValue(-1.0)) {
3097       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3098       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3099         .addUse(FNeg.getReg(0))
3100         .setMIFlags(Flags);
3101 
3102       MI.eraseFromParent();
3103       return true;
3104     }
3105   }
3106 
3107   // x / y -> x * (1.0 / y)
3108   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3109     .addUse(RHS)
3110     .setMIFlags(Flags);
3111   B.buildFMul(Res, LHS, RCP, Flags);
3112 
3113   MI.eraseFromParent();
3114   return true;
3115 }
3116 
3117 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3118                                                    MachineRegisterInfo &MRI,
3119                                                    MachineIRBuilder &B) const {
3120   Register Res = MI.getOperand(0).getReg();
3121   Register X = MI.getOperand(1).getReg();
3122   Register Y = MI.getOperand(2).getReg();
3123   uint16_t Flags = MI.getFlags();
3124   LLT ResTy = MRI.getType(Res);
3125 
3126   const MachineFunction &MF = B.getMF();
3127   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3128                             MI.getFlag(MachineInstr::FmAfn);
3129 
3130   if (!AllowInaccurateRcp)
3131     return false;
3132 
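  // Start from the hardware reciprocal estimate of y, apply two
  // Newton-Raphson iterations, then refine the final quotient once using the
  // residual x - y * (x * r).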
3133   auto NegY = B.buildFNeg(ResTy, Y);
3134   auto One = B.buildFConstant(ResTy, 1.0);
3135 
3136   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3137     .addUse(Y)
3138     .setMIFlags(Flags);
3139 
3140   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3141   R = B.buildFMA(ResTy, Tmp0, R, R);
3142 
3143   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3144   R = B.buildFMA(ResTy, Tmp1, R, R);
3145 
3146   auto Ret = B.buildFMul(ResTy, X, R);
3147   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3148 
3149   B.buildFMA(Res, Tmp2, R, Ret);
3150   MI.eraseFromParent();
3151   return true;
3152 }
3153 
3154 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3155                                          MachineRegisterInfo &MRI,
3156                                          MachineIRBuilder &B) const {
3157   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3158     return true;
3159 
3160   Register Res = MI.getOperand(0).getReg();
3161   Register LHS = MI.getOperand(1).getReg();
3162   Register RHS = MI.getOperand(2).getReg();
3163 
3164   uint16_t Flags = MI.getFlags();
3165 
3166   LLT S16 = LLT::scalar(16);
3167   LLT S32 = LLT::scalar(32);
3168 
3169   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3170   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3171 
3172   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3173     .addUse(RHSExt.getReg(0))
3174     .setMIFlags(Flags);
3175 
3176   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3177   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3178 
3179   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3180     .addUse(RDst.getReg(0))
3181     .addUse(RHS)
3182     .addUse(LHS)
3183     .setMIFlags(Flags);
3184 
3185   MI.eraseFromParent();
3186   return true;
3187 }
3188 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; otherwise restore the function's default FP32 denorm
// mode.
3191 static void toggleSPDenormMode(bool Enable,
3192                                MachineIRBuilder &B,
3193                                const GCNSubtarget &ST,
3194                                AMDGPU::SIModeRegisterDefaults Mode) {
3195   // Set SP denorm mode to this value.
3196   unsigned SPDenormMode =
3197     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3198 
3199   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3201     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3202 
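    // The S_DENORM_MODE immediate packs the FP32 mode in bits [1:0] and the
    // FP64/FP16 mode in bits [3:2].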
3203     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3204     B.buildInstr(AMDGPU::S_DENORM_MODE)
3205       .addImm(NewDenormModeValue);
3206 
3207   } else {
3208     // Select FP32 bit field in mode register.
3209     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3210                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3211                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3212 
3213     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3214       .addImm(SPDenormMode)
3215       .addImm(SPDenormModeBitField);
3216   }
3217 }
3218 
3219 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3220                                          MachineRegisterInfo &MRI,
3221                                          MachineIRBuilder &B) const {
3222   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3223     return true;
3224 
3225   Register Res = MI.getOperand(0).getReg();
3226   Register LHS = MI.getOperand(1).getReg();
3227   Register RHS = MI.getOperand(2).getReg();
3228   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3229   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3230 
3231   uint16_t Flags = MI.getFlags();
3232 
3233   LLT S32 = LLT::scalar(32);
3234   LLT S1 = LLT::scalar(1);
3235 
3236   auto One = B.buildFConstant(S32, 1.0f);
3237 
3238   auto DenominatorScaled =
3239     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3240       .addUse(LHS)
3241       .addUse(RHS)
3242       .addImm(0)
3243       .setMIFlags(Flags);
3244   auto NumeratorScaled =
3245     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3246       .addUse(LHS)
3247       .addUse(RHS)
3248       .addImm(1)
3249       .setMIFlags(Flags);
3250 
3251   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3252     .addUse(DenominatorScaled.getReg(0))
3253     .setMIFlags(Flags);
3254   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3255 
3256   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3257   // aren't modeled as reading it.
3258   if (!Mode.allFP32Denormals())
3259     toggleSPDenormMode(true, B, ST, Mode);
3260 
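  // Refine the reciprocal (Fma0/Fma1), form the scaled quotient (Mul), and
  // refine it (Fma2/Fma3); Fma4 is the residual consumed by div_fmas.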
3261   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3262   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3263   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3264   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3265   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3266   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3267 
3268   if (!Mode.allFP32Denormals())
3269     toggleSPDenormMode(false, B, ST, Mode);
3270 
3271   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3272     .addUse(Fma4.getReg(0))
3273     .addUse(Fma1.getReg(0))
3274     .addUse(Fma3.getReg(0))
3275     .addUse(NumeratorScaled.getReg(1))
3276     .setMIFlags(Flags);
3277 
3278   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3279     .addUse(Fmas.getReg(0))
3280     .addUse(RHS)
3281     .addUse(LHS)
3282     .setMIFlags(Flags);
3283 
3284   MI.eraseFromParent();
3285   return true;
3286 }
3287 
3288 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3289                                          MachineRegisterInfo &MRI,
3290                                          MachineIRBuilder &B) const {
3291   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3292     return true;
3293 
3294   Register Res = MI.getOperand(0).getReg();
3295   Register LHS = MI.getOperand(1).getReg();
3296   Register RHS = MI.getOperand(2).getReg();
3297 
3298   uint16_t Flags = MI.getFlags();
3299 
3300   LLT S64 = LLT::scalar(64);
3301   LLT S1 = LLT::scalar(1);
3302 
3303   auto One = B.buildFConstant(S64, 1.0);
3304 
3305   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3306     .addUse(LHS)
3307     .addUse(RHS)
3308     .addImm(0)
3309     .setMIFlags(Flags);
3310 
3311   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3312 
3313   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3314     .addUse(DivScale0.getReg(0))
3315     .setMIFlags(Flags);
3316 
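  // Two Newton-Raphson steps on the reciprocal (Fma0..Fma3), then form the
  // quotient and compute the residual (Fma4) for div_fmas.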
3317   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3318   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3319   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3320 
3321   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3322     .addUse(LHS)
3323     .addUse(RHS)
3324     .addImm(1)
3325     .setMIFlags(Flags);
3326 
3327   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3328   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3329   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3330 
3331   Register Scale;
3332   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3335 
3336     LLT S32 = LLT::scalar(32);
3337 
3338     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3339     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3340     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3341     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3342 
3343     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3344                               Scale1Unmerge.getReg(1));
3345     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3346                               Scale0Unmerge.getReg(1));
3347     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3348   } else {
3349     Scale = DivScale1.getReg(1);
3350   }
3351 
3352   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3353     .addUse(Fma4.getReg(0))
3354     .addUse(Fma3.getReg(0))
3355     .addUse(Mul.getReg(0))
3356     .addUse(Scale)
3357     .setMIFlags(Flags);
3358 
3359   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3360     .addUse(Fmas.getReg(0))
3361     .addUse(RHS)
3362     .addUse(LHS)
3363     .setMIFlags(Flags);
3364 
3365   MI.eraseFromParent();
3366   return true;
3367 }
3368 
3369 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3370                                                  MachineRegisterInfo &MRI,
3371                                                  MachineIRBuilder &B) const {
3372   Register Res = MI.getOperand(0).getReg();
3373   Register LHS = MI.getOperand(2).getReg();
3374   Register RHS = MI.getOperand(3).getReg();
3375   uint16_t Flags = MI.getFlags();
3376 
3377   LLT S32 = LLT::scalar(32);
3378   LLT S1 = LLT::scalar(1);
3379 
3380   auto Abs = B.buildFAbs(S32, RHS, Flags);
3381   const APFloat C0Val(1.0f);
3382 
3383   auto C0 = B.buildConstant(S32, 0x6f800000);
3384   auto C1 = B.buildConstant(S32, 0x2f800000);
3385   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3386 
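  // If |RHS| is huge (> 0x6f800000 == 2**96), pre-scale it by 2**-32
  // (0x2f800000) before the rcp and re-apply the same scale to the final
  // product, keeping the intermediate reciprocal in range; otherwise scale
  // by 1.0.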
3387   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3388   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3389 
3390   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3391 
3392   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3393     .addUse(Mul0.getReg(0))
3394     .setMIFlags(Flags);
3395 
3396   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3397 
3398   B.buildFMul(Res, Sel, Mul1, Flags);
3399 
3400   MI.eraseFromParent();
3401   return true;
3402 }
3403 
3404 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3405 // FIXME: Why do we handle this one but not other removed instructions?
3406 //
3407 // Reciprocal square root.  The clamp prevents infinite results, clamping
3408 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3409 // +-max_float.
3410 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3411                                                     MachineRegisterInfo &MRI,
3412                                                     MachineIRBuilder &B) const {
3413   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3414     return true;
3415 
3416   Register Dst = MI.getOperand(0).getReg();
3417   Register Src = MI.getOperand(2).getReg();
3418   auto Flags = MI.getFlags();
3419 
3420   LLT Ty = MRI.getType(Dst);
3421 
3422   const fltSemantics *FltSemantics;
3423   if (Ty == LLT::scalar(32))
3424     FltSemantics = &APFloat::IEEEsingle();
3425   else if (Ty == LLT::scalar(64))
3426     FltSemantics = &APFloat::IEEEdouble();
3427   else
3428     return false;
3429 
3430   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3431     .addUse(Src)
3432     .setMIFlags(Flags);
3433 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq has already quieted (or not) the input; just use the form that will
  // directly select.
3436   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3437   const bool UseIEEE = MFI->getMode().IEEE;
3438 
3439   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3440   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3441                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3442 
3443   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3444 
3445   if (UseIEEE)
3446     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3447   else
3448     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3449   MI.eraseFromParent();
3450   return true;
3451 }
3452 
3453 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3454   switch (IID) {
3455   case Intrinsic::amdgcn_ds_fadd:
3456     return AMDGPU::G_ATOMICRMW_FADD;
3457   case Intrinsic::amdgcn_ds_fmin:
3458     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3459   case Intrinsic::amdgcn_ds_fmax:
3460     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3461   default:
3462     llvm_unreachable("not a DS FP intrinsic");
3463   }
3464 }
3465 
3466 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3467                                                       MachineInstr &MI,
3468                                                       Intrinsic::ID IID) const {
3469   GISelChangeObserver &Observer = Helper.Observer;
3470   Observer.changingInstr(MI);
3471 
3472   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3473 
3474   // The remaining operands were used to set fields in the MemOperand on
3475   // construction.
3476   for (int I = 6; I > 3; --I)
3477     MI.RemoveOperand(I);
3478 
3479   MI.RemoveOperand(1); // Remove the intrinsic ID.
3480   Observer.changedInstr(MI);
3481   return true;
3482 }
3483 
3484 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3485                                             MachineRegisterInfo &MRI,
3486                                             MachineIRBuilder &B) const {
3487   uint64_t Offset =
3488     ST.getTargetLowering()->getImplicitParameterOffset(
3489       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3490   LLT DstTy = MRI.getType(DstReg);
3491   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3492 
3493   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3494   if (!loadInputValue(KernargPtrReg, B,
3495                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3496     return false;
3497 
3498   // FIXME: This should be nuw
3499   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3500   return true;
3501 }
3502 
3503 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3504                                                  MachineRegisterInfo &MRI,
3505                                                  MachineIRBuilder &B) const {
3506   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3507   if (!MFI->isEntryFunction()) {
3508     return legalizePreloadedArgIntrin(MI, MRI, B,
3509                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3510   }
3511 
3512   Register DstReg = MI.getOperand(0).getReg();
3513   if (!getImplicitArgPtr(DstReg, MRI, B))
3514     return false;
3515 
3516   MI.eraseFromParent();
3517   return true;
3518 }
3519 
3520 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3521                                               MachineRegisterInfo &MRI,
3522                                               MachineIRBuilder &B,
3523                                               unsigned AddrSpace) const {
3524   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3525   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3526   Register Hi32 = Unmerge.getReg(1);
3527 
3528   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3529   MI.eraseFromParent();
3530   return true;
3531 }
3532 
3533 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3534 // offset (the offset that is included in bounds checking and swizzling, to be
3535 // split between the instruction's voffset and immoffset fields) and soffset
3536 // (the offset that is excluded from bounds checking and swizzling, to go in
3537 // the instruction's soffset field).  This function takes the first kind of
3538 // offset and figures out how to split it between voffset and immoffset.
3539 std::tuple<Register, unsigned, unsigned>
3540 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3541                                         Register OrigOffset) const {
3542   const unsigned MaxImm = 4095;
3543   Register BaseReg;
3544   unsigned TotalConstOffset;
3545   const LLT S32 = LLT::scalar(32);
3546   MachineRegisterInfo &MRI = *B.getMRI();
3547 
3548   std::tie(BaseReg, TotalConstOffset) =
3549       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3550 
3551   unsigned ImmOffset = TotalConstOffset;
3552 
3553   // If BaseReg is a pointer, convert it to int.
3554   if (MRI.getType(BaseReg).isPointer())
3555     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3556 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in the immoffset field, so that the part that is
  // copied/added for the voffset field is a multiple of 4096 and stands more
  // chance of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if it would
  // leave a negative number, as it appears to be illegal to have a negative
  // offset in the vgpr, even if adding the immediate offset makes it positive.
3564   unsigned Overflow = ImmOffset & ~MaxImm;
3565   ImmOffset -= Overflow;
3566   if ((int32_t)Overflow < 0) {
3567     Overflow += ImmOffset;
3568     ImmOffset = 0;
3569   }
3570 
3571   if (Overflow != 0) {
3572     if (!BaseReg) {
3573       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3574     } else {
3575       auto OverflowVal = B.buildConstant(S32, Overflow);
3576       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3577     }
3578   }
3579 
3580   if (!BaseReg)
3581     BaseReg = B.buildConstant(S32, 0).getReg(0);
3582 
3583   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3584 }
3585 
3586 /// Handle register layout difference for f16 images for some subtargets.
3587 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3588                                              MachineRegisterInfo &MRI,
3589                                              Register Reg,
3590                                              bool ImageStore) const {
3591   const LLT S16 = LLT::scalar(16);
3592   const LLT S32 = LLT::scalar(32);
3593   LLT StoreVT = MRI.getType(Reg);
3594   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3595 
3596   if (ST.hasUnpackedD16VMem()) {
3597     auto Unmerge = B.buildUnmerge(S16, Reg);
3598 
3599     SmallVector<Register, 4> WideRegs;
3600     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3601       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3602 
3603     int NumElts = StoreVT.getNumElements();
3604 
3605     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3606   }
3607 
3608   if (ImageStore && ST.hasImageStoreD16Bug()) {
3609     if (StoreVT.getNumElements() == 2) {
3610       SmallVector<Register, 4> PackedRegs;
3611       Reg = B.buildBitcast(S32, Reg).getReg(0);
3612       PackedRegs.push_back(Reg);
3613       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3614       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3615     }
3616 
3617     if (StoreVT.getNumElements() == 3) {
3618       SmallVector<Register, 4> PackedRegs;
3619       auto Unmerge = B.buildUnmerge(S16, Reg);
3620       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3621         PackedRegs.push_back(Unmerge.getReg(I));
3622       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3623       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3624       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3625     }
3626 
3627     if (StoreVT.getNumElements() == 4) {
3628       SmallVector<Register, 4> PackedRegs;
3629       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3630       auto Unmerge = B.buildUnmerge(S32, Reg);
3631       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3632         PackedRegs.push_back(Unmerge.getReg(I));
3633       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3634       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3635     }
3636 
3637     llvm_unreachable("invalid data type");
3638   }
3639 
3640   return Reg;
3641 }
3642 
3643 Register AMDGPULegalizerInfo::fixStoreSourceType(
3644   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3645   MachineRegisterInfo *MRI = B.getMRI();
3646   LLT Ty = MRI->getType(VData);
3647 
3648   const LLT S16 = LLT::scalar(16);
3649 
3650   // Fixup illegal register types for i8 stores.
3651   if (Ty == LLT::scalar(8) || Ty == S16) {
3652     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3653     return AnyExt;
3654   }
3655 
3656   if (Ty.isVector()) {
3657     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3658       if (IsFormat)
3659         return handleD16VData(B, *MRI, VData);
3660     }
3661   }
3662 
3663   return VData;
3664 }
3665 
3666 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3667                                               MachineRegisterInfo &MRI,
3668                                               MachineIRBuilder &B,
3669                                               bool IsTyped,
3670                                               bool IsFormat) const {
3671   Register VData = MI.getOperand(1).getReg();
3672   LLT Ty = MRI.getType(VData);
3673   LLT EltTy = Ty.getScalarType();
3674   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3675   const LLT S32 = LLT::scalar(32);
3676 
3677   VData = fixStoreSourceType(B, VData, IsFormat);
3678   Register RSrc = MI.getOperand(2).getReg();
3679 
3680   MachineMemOperand *MMO = *MI.memoperands_begin();
3681   const int MemSize = MMO->getSize();
3682 
3683   unsigned ImmOffset;
3684   unsigned TotalOffset;
3685 
3686   // The typed intrinsics add an immediate after the registers.
3687   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3688 
3689   // The struct intrinsic variants add one additional operand over raw.
3690   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3691   Register VIndex;
3692   int OpOffset = 0;
3693   if (HasVIndex) {
3694     VIndex = MI.getOperand(3).getReg();
3695     OpOffset = 1;
3696   }
3697 
3698   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3699   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3700 
3701   unsigned Format = 0;
3702   if (IsTyped) {
3703     Format = MI.getOperand(5 + OpOffset).getImm();
3704     ++OpOffset;
3705   }
3706 
3707   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3708 
3709   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3710   if (TotalOffset != 0)
3711     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3712 
3713   unsigned Opc;
3714   if (IsTyped) {
3715     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3716                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3717   } else if (IsFormat) {
3718     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3719                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3720   } else {
3721     switch (MemSize) {
3722     case 1:
3723       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3724       break;
3725     case 2:
3726       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3727       break;
3728     default:
3729       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3730       break;
3731     }
3732   }
3733 
3734   if (!VIndex)
3735     VIndex = B.buildConstant(S32, 0).getReg(0);
3736 
3737   auto MIB = B.buildInstr(Opc)
3738     .addUse(VData)              // vdata
3739     .addUse(RSrc)               // rsrc
3740     .addUse(VIndex)             // vindex
3741     .addUse(VOffset)            // voffset
3742     .addUse(SOffset)            // soffset
3743     .addImm(ImmOffset);         // offset(imm)
3744 
3745   if (IsTyped)
3746     MIB.addImm(Format);
3747 
3748   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3749      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3750      .addMemOperand(MMO);
3751 
3752   MI.eraseFromParent();
3753   return true;
3754 }
3755 
3756 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3757                                              MachineRegisterInfo &MRI,
3758                                              MachineIRBuilder &B,
3759                                              bool IsFormat,
3760                                              bool IsTyped) const {
3761   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3762   MachineMemOperand *MMO = *MI.memoperands_begin();
3763   const int MemSize = MMO->getSize();
3764   const LLT S32 = LLT::scalar(32);
3765 
3766   Register Dst = MI.getOperand(0).getReg();
3767   Register RSrc = MI.getOperand(2).getReg();
3768 
3769   // The typed intrinsics add an immediate after the registers.
3770   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3771 
3772   // The struct intrinsic variants add one additional operand over raw.
3773   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3774   Register VIndex;
3775   int OpOffset = 0;
3776   if (HasVIndex) {
3777     VIndex = MI.getOperand(3).getReg();
3778     OpOffset = 1;
3779   }
3780 
3781   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3782   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3783 
3784   unsigned Format = 0;
3785   if (IsTyped) {
3786     Format = MI.getOperand(5 + OpOffset).getImm();
3787     ++OpOffset;
3788   }
3789 
3790   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3791   unsigned ImmOffset;
3792   unsigned TotalOffset;
3793 
3794   LLT Ty = MRI.getType(Dst);
3795   LLT EltTy = Ty.getScalarType();
3796   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3797   const bool Unpacked = ST.hasUnpackedD16VMem();
3798 
3799   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3800   if (TotalOffset != 0)
3801     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3802 
3803   unsigned Opc;
3804 
3805   if (IsTyped) {
3806     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3807                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3808   } else if (IsFormat) {
3809     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3810                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3811   } else {
3812     switch (MemSize) {
3813     case 1:
3814       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3815       break;
3816     case 2:
3817       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3818       break;
3819     default:
3820       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3821       break;
3822     }
3823   }
3824 
3825   Register LoadDstReg;
3826 
3827   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3828   LLT UnpackedTy = Ty.changeElementSize(32);
3829 
3830   if (IsExtLoad)
3831     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3832   else if (Unpacked && IsD16 && Ty.isVector())
3833     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3834   else
3835     LoadDstReg = Dst;
3836 
3837   if (!VIndex)
3838     VIndex = B.buildConstant(S32, 0).getReg(0);
3839 
3840   auto MIB = B.buildInstr(Opc)
3841     .addDef(LoadDstReg)         // vdata
3842     .addUse(RSrc)               // rsrc
3843     .addUse(VIndex)             // vindex
3844     .addUse(VOffset)            // voffset
3845     .addUse(SOffset)            // soffset
3846     .addImm(ImmOffset);         // offset(imm)
3847 
3848   if (IsTyped)
3849     MIB.addImm(Format);
3850 
3851   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3852      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3853      .addMemOperand(MMO);
3854 
3855   if (LoadDstReg != Dst) {
3856     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3857 
    // The result was widened for the extending load; truncate back to the
    // original type.
3859     if (IsExtLoad)
3860       B.buildTrunc(Dst, LoadDstReg);
3861     else {
3862       // Repack to original 16-bit vector result
3863       // FIXME: G_TRUNC should work, but legalization currently fails
3864       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3865       SmallVector<Register, 4> Repack;
3866       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3867         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3868       B.buildMerge(Dst, Repack);
3869     }
3870   }
3871 
3872   MI.eraseFromParent();
3873   return true;
3874 }
3875 
3876 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3877                                                MachineIRBuilder &B,
3878                                                bool IsInc) const {
3879   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3880                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3881   B.buildInstr(Opc)
3882     .addDef(MI.getOperand(0).getReg())
3883     .addUse(MI.getOperand(2).getReg())
3884     .addUse(MI.getOperand(3).getReg())
3885     .cloneMemRefs(MI);
3886   MI.eraseFromParent();
3887   return true;
3888 }
3889 
3890 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3891   switch (IntrID) {
3892   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3893   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3894     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3895   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3896   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3897     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3898   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3899   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3900     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3901   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3902   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3903     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3904   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3905   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3906     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3907   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3908   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3909     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3910   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3911   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3912     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3913   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3914   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3915     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3916   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3917   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3918     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3919   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3920   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3921     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3922   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3923   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3924     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3925   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3926   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3927     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3928   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3929   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3930     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3931   case Intrinsic::amdgcn_buffer_atomic_fadd:
3932   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3933   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3934     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3935   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3936   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3937     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3938   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3939   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3940     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3941   default:
3942     llvm_unreachable("unhandled atomic opcode");
3943   }
3944 }
3945 
3946 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3947                                                MachineIRBuilder &B,
3948                                                Intrinsic::ID IID) const {
3949   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3950                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3951   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3952 
3953   Register Dst;
3954 
3955   int OpOffset = 0;
3956   if (HasReturn) {
3957     // A few FP atomics do not support return values.
3958     Dst = MI.getOperand(0).getReg();
3959   } else {
3960     OpOffset = -1;
3961   }
3962 
3963   Register VData = MI.getOperand(2 + OpOffset).getReg();
3964   Register CmpVal;
3965 
3966   if (IsCmpSwap) {
3967     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3968     ++OpOffset;
3969   }
3970 
3971   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3972   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3973 
3974   // The struct intrinsic variants add one additional operand over raw.
3975   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3976   Register VIndex;
3977   if (HasVIndex) {
3978     VIndex = MI.getOperand(4 + OpOffset).getReg();
3979     ++OpOffset;
3980   }
3981 
3982   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3983   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3984   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3985 
3986   MachineMemOperand *MMO = *MI.memoperands_begin();
3987 
3988   unsigned ImmOffset;
3989   unsigned TotalOffset;
3990   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3991   if (TotalOffset != 0)
3992     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3993 
3994   if (!VIndex)
3995     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3996 
3997   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3998 
3999   if (HasReturn)
4000     MIB.addDef(Dst);
4001 
4002   MIB.addUse(VData); // vdata
4003 
4004   if (IsCmpSwap)
4005     MIB.addReg(CmpVal);
4006 
4007   MIB.addUse(RSrc)               // rsrc
4008      .addUse(VIndex)             // vindex
4009      .addUse(VOffset)            // voffset
4010      .addUse(SOffset)            // soffset
4011      .addImm(ImmOffset)          // offset(imm)
4012      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4013      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4014      .addMemOperand(MMO);
4015 
4016   MI.eraseFromParent();
4017   return true;
4018 }
4019 
4020 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
4021 /// vector with s16 typed elements.
4022 static void packImageA16AddressToDwords(
4023     MachineIRBuilder &B, MachineInstr &MI,
4024     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4025     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4026   const LLT S16 = LLT::scalar(16);
4027   const LLT V2S16 = LLT::vector(2, 16);
4028 
4029   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4030     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4031     if (!SrcOp.isReg())
4032       continue; // _L to _LZ may have eliminated this.
4033 
4034     Register AddrReg = SrcOp.getReg();
4035 
4036     if (I < Intr->GradientStart) {
4037       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4038       PackedAddrs.push_back(AddrReg);
4039     } else {
4040       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4041       // derivatives dx/dh and dx/dv are packed with undef.
4042       if (((I + 1) >= EndIdx) ||
4043           ((Intr->NumGradients / 2) % 2 == 1 &&
4044            (I == static_cast<unsigned>(Intr->GradientStart +
4045                                        (Intr->NumGradients / 2) - 1) ||
4046             I == static_cast<unsigned>(Intr->GradientStart +
4047                                        Intr->NumGradients - 1))) ||
4048           // Check for _L to _LZ optimization
4049           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4050         PackedAddrs.push_back(
4051             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4052                 .getReg(0));
4053       } else {
4054         PackedAddrs.push_back(
4055             B.buildBuildVector(
4056                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4057                 .getReg(0));
4058         ++I;
4059       }
4060     }
4061   }
4062 }
4063 
4064 /// Convert from separate vaddr components to a single vector address register,
4065 /// and replace the remaining operands with $noreg.
4066 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4067                                      int DimIdx, int NumVAddrs) {
4068   const LLT S32 = LLT::scalar(32);
4069 
4070   SmallVector<Register, 8> AddrRegs;
4071   for (int I = 0; I != NumVAddrs; ++I) {
4072     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4073     if (SrcOp.isReg()) {
4074       AddrRegs.push_back(SrcOp.getReg());
4075       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4076     }
4077   }
4078 
4079   int NumAddrRegs = AddrRegs.size();
4080   if (NumAddrRegs != 1) {
4081     // Round up to 8 elements for v5-v7
4082     // FIXME: Missing intermediate sized register classes and instructions.
4083     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4084       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4085       auto Undef = B.buildUndef(S32);
4086       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4087       NumAddrRegs = RoundedNumRegs;
4088     }
4089 
4090     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4091     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4092   }
4093 
4094   for (int I = 1; I != NumVAddrs; ++I) {
4095     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4096     if (SrcOp.isReg())
4097       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4098   }
4099 }
4100 
4101 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4102 ///
4103 /// Depending on the subtarget, load/store with 16-bit element data need to be
4104 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4105 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4106 /// registers.
4107 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
4114 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4115     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4116     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4117 
4118   const unsigned NumDefs = MI.getNumExplicitDefs();
4119   const unsigned ArgOffset = NumDefs + 1;
4120   bool IsTFE = NumDefs == 2;
4121   // We are only processing the operands of d16 image operations on subtargets
4122   // that use the unpacked register layout, or need to repack the TFE result.
4123 
4124   // TODO: Do we need to guard against already legalized intrinsics?
4125   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4126       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4127 
4128   MachineRegisterInfo *MRI = B.getMRI();
4129   const LLT S32 = LLT::scalar(32);
4130   const LLT S16 = LLT::scalar(16);
4131   const LLT V2S16 = LLT::vector(2, 16);
4132 
4133   unsigned DMask = 0;
4134 
4135   // Check for 16 bit addresses and pack if true.
4136   LLT GradTy =
4137       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4138   LLT AddrTy =
4139       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4140   const bool IsG16 = GradTy == S16;
4141   const bool IsA16 = AddrTy == S16;
4142 
4143   int DMaskLanes = 0;
4144   if (!BaseOpcode->Atomic) {
4145     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4146     if (BaseOpcode->Gather4) {
4147       DMaskLanes = 4;
4148     } else if (DMask != 0) {
4149       DMaskLanes = countPopulation(DMask);
4150     } else if (!IsTFE && !BaseOpcode->Store) {
4151       // If dmask is 0, this is a no-op load. This can be eliminated.
4152       B.buildUndef(MI.getOperand(0));
4153       MI.eraseFromParent();
4154       return true;
4155     }
4156   }
4157 
4158   Observer.changingInstr(MI);
4159   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4160 
4161   unsigned NewOpcode = NumDefs == 0 ?
4162     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4163 
4164   // Track that we legalized this
4165   MI.setDesc(B.getTII().get(NewOpcode));
4166 
  // We expect to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
4169   if (IsTFE && DMask == 0) {
4170     DMask = 0x1;
4171     DMaskLanes = 1;
4172     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4173   }
4174 
4175   if (BaseOpcode->Atomic) {
4176     Register VData0 = MI.getOperand(2).getReg();
4177     LLT Ty = MRI->getType(VData0);
4178 
4179     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4180     if (Ty.isVector())
4181       return false;
4182 
4183     if (BaseOpcode->AtomicX2) {
4184       Register VData1 = MI.getOperand(3).getReg();
4185       // The two values are packed in one register.
4186       LLT PackedTy = LLT::vector(2, Ty);
4187       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4188       MI.getOperand(2).setReg(Concat.getReg(0));
4189       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4190     }
4191   }
4192 
4193   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4194 
4195   // Optimize _L to _LZ when _L is zero
4196   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4197           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4198     const ConstantFP *ConstantLod;
4199 
4200     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4201                  m_GFCst(ConstantLod))) {
4202       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4203         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4204         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4205             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4206                                                       Intr->Dim);
4207 
4208         // The starting indexes should remain in the same place.
4209         --CorrectedNumVAddrs;
4210 
4211         MI.getOperand(MI.getNumExplicitDefs())
4212             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4213         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4214         Intr = NewImageDimIntr;
4215       }
4216     }
4217   }
4218 
4219   // Optimize _mip away, when 'lod' is zero
4220   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4221     int64_t ConstantLod;
4222     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4223                  m_ICst(ConstantLod))) {
4224       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4227         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4228         --CorrectedNumVAddrs;
4229       }
4230     }
4231   }
4232 
4233   // Rewrite the addressing register layout before doing anything else.
4234   if (IsA16 || IsG16) {
4235     if (IsA16) {
4236       // Target must support the feature and gradients need to be 16 bit too
4237       if (!ST.hasA16() || !IsG16)
4238         return false;
4239     } else if (!ST.hasG16())
4240       return false;
4241 
4242     if (Intr->NumVAddrs > 1) {
4243       SmallVector<Register, 4> PackedRegs;
4244       // Don't compress addresses for G16
4245       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4246       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4247                                   PackEndIdx);
4248 
4249       if (!IsA16) {
4250         // Add uncompressed address
4251         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4252           int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4253           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4254           PackedRegs.push_back(AddrReg);
4255         }
4256       }
4257 
4258       // See also below in the non-a16 branch
4259       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4260 
4261       if (!UseNSA && PackedRegs.size() > 1) {
4262         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4263         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4264         PackedRegs[0] = Concat.getReg(0);
4265         PackedRegs.resize(1);
4266       }
4267 
4268       const unsigned NumPacked = PackedRegs.size();
4269       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4270         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4271         if (!SrcOp.isReg()) {
4272           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4273           continue;
4274         }
4275 
4276         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4277 
4278         if (I - Intr->VAddrStart < NumPacked)
4279           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4280         else
4281           SrcOp.setReg(AMDGPU::NoRegister);
4282       }
4283     }
4284   } else {
4285     // If the register allocator cannot place the address registers contiguously
4286     // without introducing moves, then using the non-sequential address encoding
4287     // is always preferable, since it saves VALU instructions and is usually a
4288     // wash in terms of code size or even better.
4289     //
4290     // However, we currently have no way of hinting to the register allocator
4291     // that MIMG addresses should be placed contiguously when it is possible to
4292     // do so, so force non-NSA for the common 2-address case as a heuristic.
4293     //
4294     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4295     // allocation when possible.
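    //
    // In practice: with only two address registers the packed (non-NSA) form
    // below is always used; three or more addresses keep the NSA encoding
    // whenever the subtarget supports it.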
4296     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4297 
4298     if (!UseNSA && Intr->NumVAddrs > 1)
4299       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4300                                Intr->NumVAddrs);
4301   }
4302 
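  // Record which 16-bit address modes were used as an immediate flags operand
  // (bit 0 = A16, bit 1 = G16); this is presumably consumed later during
  // instruction selection.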
4303   int Flags = 0;
4304   if (IsA16)
4305     Flags |= 1;
4306   if (IsG16)
4307     Flags |= 2;
4308   MI.addOperand(MachineOperand::CreateImm(Flags));
4309 
4310   if (BaseOpcode->Store) { // No TFE for stores?
4311     // TODO: Handle dmask trim
4312     Register VData = MI.getOperand(1).getReg();
4313     LLT Ty = MRI->getType(VData);
4314     if (!Ty.isVector() || Ty.getElementType() != S16)
4315       return true;
4316 
4317     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4318     if (RepackedReg != VData) {
4319       MI.getOperand(1).setReg(RepackedReg);
4320     }
4321 
4322     return true;
4323   }
4324 
4325   Register DstReg = MI.getOperand(0).getReg();
4326   LLT Ty = MRI->getType(DstReg);
4327   const LLT EltTy = Ty.getScalarType();
4328   const bool IsD16 = Ty.getScalarType() == S16;
4329   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4330 
4331   // Confirm that the return type is large enough for the dmask specified
4332   if (NumElts < DMaskLanes)
4333     return false;
4334 
4335   if (NumElts > 4 || DMaskLanes > 4)
4336     return false;
4337 
4338   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4339   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4340 
4341   // The raw dword-aligned data component of the load. The only legal cases
4342   // where this matters should be when using the packed D16 format, for
4343   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4344   LLT RoundedTy;
4345 
4346   // S32 vector to cover all data, plus the TFE result element.
4347   LLT TFETy;
4348 
4349   // Register type to use for each loaded component. Will be S32 or V2S16.
4350   LLT RegTy;
4351 
4352   if (IsD16 && ST.hasUnpackedD16VMem()) {
4353     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4354     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4355     RegTy = S32;
4356   } else {
4357     unsigned EltSize = EltTy.getSizeInBits();
4358     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4359     unsigned RoundedSize = 32 * RoundedElts;
4360     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4361     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4362     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4363   }
4364 
4365   // The return type does not need adjustment.
4366   // TODO: Should we change s16 case to s32 or <2 x s16>?
4367   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4368     return true;
4369 
4370   Register Dst1Reg;
4371 
4372   // Insert after the instruction.
4373   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4374 
4375   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4376   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4377   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4378   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4379 
4380   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4381 
4382   MI.getOperand(0).setReg(NewResultReg);
4383 
4384   // In the IR, TFE is supposed to be used with a 2-element struct return
4385   // type. The instruction really returns these two values in one contiguous
4386   // register, with one additional dword beyond the loaded data. Rewrite the
4387   // return type to use a single register result.
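  //
  // As an illustrative example, a TFE load returning { <4 x float>, i32 } in
  // the IR is rewritten here to produce five contiguous dwords: four data
  // dwords followed by the TFE status dword.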
4388 
4389   if (IsTFE) {
4390     Dst1Reg = MI.getOperand(1).getReg();
4391     if (MRI->getType(Dst1Reg) != S32)
4392       return false;
4393 
4394     // TODO: Make sure the TFE operand bit is set.
4395     MI.RemoveOperand(1);
4396 
4397     // Handle the easy case that requires no repack instructions.
4398     if (Ty == S32) {
4399       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4400       return true;
4401     }
4402   }
4403 
4404   // Now figure out how to copy the new result register back into the old
4405   // result.
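  // The result registers are seeded with Dst1Reg so that, when TFE is used,
  // the final entry already refers to the status register and the unmerge
  // below writes it directly; only the data entries are replaced.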
4406   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4407 
4408   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4409 
4410   if (ResultNumRegs == 1) {
4411     assert(!IsTFE);
4412     ResultRegs[0] = NewResultReg;
4413   } else {
4414     // We have to repack into a new vector of some kind.
4415     for (int I = 0; I != NumDataRegs; ++I)
4416       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4417     B.buildUnmerge(ResultRegs, NewResultReg);
4418 
4419     // Drop the final TFE element to get the data part. The TFE result is
4420     // directly written to the right place already.
4421     if (IsTFE)
4422       ResultRegs.resize(NumDataRegs);
4423   }
4424 
4425   // For an s16 scalar result, we form an s32 result with a truncate regardless
4426   // of packed vs. unpacked.
4427   if (IsD16 && !Ty.isVector()) {
4428     B.buildTrunc(DstReg, ResultRegs[0]);
4429     return true;
4430   }
4431 
4432   // Avoid a build/concat_vector of 1 entry.
4433   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4434     B.buildBitcast(DstReg, ResultRegs[0]);
4435     return true;
4436   }
4437 
4438   assert(Ty.isVector());
4439 
4440   if (IsD16) {
4441     // For packed D16 results with TFE enabled, all the data components are
4442     // S32. Cast back to the expected type.
4443     //
4444     // TODO: We don't really need to load s32 elements; we would only need one
4445     // cast for the TFE result if a multiple of v2s16 were used.
4446     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4447       for (Register &Reg : ResultRegs)
4448         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4449     } else if (ST.hasUnpackedD16VMem()) {
4450       for (Register &Reg : ResultRegs)
4451         Reg = B.buildTrunc(S16, Reg).getReg(0);
4452     }
4453   }
4454 
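  // Helper to append undef registers so the build/concat below still covers
  // the full destination type when the dmask selected fewer components.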
4455   auto padWithUndef = [&](LLT Ty, int NumElts) {
4456     if (NumElts == 0)
4457       return;
4458     Register Undef = B.buildUndef(Ty).getReg(0);
4459     for (int I = 0; I != NumElts; ++I)
4460       ResultRegs.push_back(Undef);
4461   };
4462 
4463   // Pad out any elements eliminated due to the dmask.
4464   LLT ResTy = MRI->getType(ResultRegs[0]);
4465   if (!ResTy.isVector()) {
4466     padWithUndef(ResTy, NumElts - ResultRegs.size());
4467     B.buildBuildVector(DstReg, ResultRegs);
4468     return true;
4469   }
4470 
4471   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4472   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4473 
4474   // Deal with the one annoying legal case.
4475   const LLT V3S16 = LLT::vector(3, 16);
4476   if (Ty == V3S16) {
4477     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4478     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4479     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4480     return true;
4481   }
4482 
4483   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4484   B.buildConcatVectors(DstReg, ResultRegs);
4485   return true;
4486 }
4487 
4488 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4489   LegalizerHelper &Helper, MachineInstr &MI) const {
4490   MachineIRBuilder &B = Helper.MIRBuilder;
4491   GISelChangeObserver &Observer = Helper.Observer;
4492 
4493   Register Dst = MI.getOperand(0).getReg();
4494   LLT Ty = B.getMRI()->getType(Dst);
4495   unsigned Size = Ty.getSizeInBits();
4496   MachineFunction &MF = B.getMF();
4497 
4498   Observer.changingInstr(MI);
4499 
4500   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4501     Ty = getBitcastRegisterType(Ty);
4502     Helper.bitcastDst(MI, Ty, 0);
4503     Dst = MI.getOperand(0).getReg();
4504     B.setInsertPt(B.getMBB(), MI);
4505   }
4506 
4507   // FIXME: We don't really need this intermediate instruction. The intrinsic
4508   // should be fixed to have a memory operand. Since it's readnone, we're not
4509   // allowed to add one.
4510   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4511   MI.RemoveOperand(1); // Remove intrinsic ID
4512 
4513   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4514   // TODO: Should this use datalayout alignment?
4515   const unsigned MemSize = (Size + 7) / 8;
4516   const Align MemAlign(4);
4517   MachineMemOperand *MMO = MF.getMachineMemOperand(
4518       MachinePointerInfo(),
4519       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4520           MachineMemOperand::MOInvariant,
4521       MemSize, MemAlign);
4522   MI.addMemOperand(MF, MMO);
4523 
4524   // There are no 96-bit result scalar loads, but widening to 128-bit should
4525   // always be legal. We may need to restore this to a 96-bit result if it turns
4526   // out this needs to be converted to a vector load during RegBankSelect.
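  // For example, an s96 result is widened to s128 here, and a <3 x s32>
  // result gains an element to become <4 x s32>.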
4527   if (!isPowerOf2_32(Size)) {
4528     if (Ty.isVector())
4529       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4530     else
4531       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4532   }
4533 
4534   Observer.changedInstr(MI);
4535   return true;
4536 }
4537 
4538 // TODO: Move to selection
4539 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4540                                                 MachineRegisterInfo &MRI,
4541                                                 MachineIRBuilder &B) const {
4542   if (!ST.isTrapHandlerEnabled() ||
4543       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4544     return legalizeTrapEndpgm(MI, MRI, B);
4545 
4546   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4547     switch (*HsaAbiVer) {
4548     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4549     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4550       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4551     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4552       return ST.supportsGetDoorbellID() ?
4553           legalizeTrapHsa(MI, MRI, B) :
4554           legalizeTrapHsaQueuePtr(MI, MRI, B);
4555     }
4556   }
4557 
4558   llvm_unreachable("Unknown trap handler");
4559 }
4560 
4561 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4562     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4563   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4564   MI.eraseFromParent();
4565   return true;
4566 }
4567 
4568 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4569     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4570   // Pass the queue pointer to the trap handler as input and insert the trap
4571   // instruction. See https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
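  // The trap handler ABI referenced above expects the queue pointer in
  // SGPR0/SGPR1, so the loaded value is copied there and attached as an
  // implicit use of S_TRAP.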
4572   Register LiveIn =
4573     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4574   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4575     return false;
4576 
4577   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4578   B.buildCopy(SGPR01, LiveIn);
4579   B.buildInstr(AMDGPU::S_TRAP)
4580       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4581       .addReg(SGPR01, RegState::Implicit);
4582 
4583   MI.eraseFromParent();
4584   return true;
4585 }
4586 
4587 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4588     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4589   B.buildInstr(AMDGPU::S_TRAP)
4590       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4591   MI.eraseFromParent();
4592   return true;
4593 }
4594 
4595 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4596     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4597   // If this is a non-HSA path or the trap handler is disabled, report a
4598   // warning accordingly.
4599   if (!ST.isTrapHandlerEnabled() ||
4600       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4601     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4602                                      "debugtrap handler not supported",
4603                                      MI.getDebugLoc(), DS_Warning);
4604     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4605     Ctx.diagnose(NoTrap);
4606   } else {
4607     // Insert debug-trap instruction
4608     B.buildInstr(AMDGPU::S_TRAP)
4609         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4610   }
4611 
4612   MI.eraseFromParent();
4613   return true;
4614 }
4615 
4616 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4617                                                MachineIRBuilder &B) const {
4618   MachineRegisterInfo &MRI = *B.getMRI();
4619   const LLT S16 = LLT::scalar(16);
4620   const LLT S32 = LLT::scalar(32);
4621 
4622   Register DstReg = MI.getOperand(0).getReg();
4623   Register NodePtr = MI.getOperand(2).getReg();
4624   Register RayExtent = MI.getOperand(3).getReg();
4625   Register RayOrigin = MI.getOperand(4).getReg();
4626   Register RayDir = MI.getOperand(5).getReg();
4627   Register RayInvDir = MI.getOperand(6).getReg();
4628   Register TDescr = MI.getOperand(7).getReg();
4629 
4630   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4631   bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4632   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4633                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4634                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4635                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4636 
4637   SmallVector<Register, 12> Ops;
4638   if (Is64) {
4639     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4640     Ops.push_back(Unmerge.getReg(0));
4641     Ops.push_back(Unmerge.getReg(1));
4642   } else {
4643     Ops.push_back(NodePtr);
4644   }
4645   Ops.push_back(RayExtent);
4646 
4647   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4648     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4649     Ops.push_back(Unmerge.getReg(0));
4650     Ops.push_back(Unmerge.getReg(1));
4651     Ops.push_back(Unmerge.getReg(2));
4652   };
4653 
4654   packLanes(RayOrigin);
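  // With 16-bit components (a16), the ray direction and inverse direction are
  // repacked pairwise into three dwords below:
  //   {dir.x, dir.y}, {dir.z, inv.x}, {inv.y, inv.z}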
4655   if (IsA16) {
4656     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4657     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4658     Register R1 = MRI.createGenericVirtualRegister(S32);
4659     Register R2 = MRI.createGenericVirtualRegister(S32);
4660     Register R3 = MRI.createGenericVirtualRegister(S32);
4661     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4662     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4663     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4664     Ops.push_back(R1);
4665     Ops.push_back(R2);
4666     Ops.push_back(R3);
4667   } else {
4668     packLanes(RayDir);
4669     packLanes(RayInvDir);
4670   }
4671 
4672   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4673     .addDef(DstReg)
4674     .addImm(Opcode);
4675 
4676   for (Register R : Ops) {
4677     MIB.addUse(R);
4678   }
4679 
4680   MIB.addUse(TDescr)
4681      .addImm(IsA16 ? 1 : 0)
4682      .cloneMemRefs(MI);
4683 
4684   MI.eraseFromParent();
4685   return true;
4686 }
4687 
4688 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4689                                             MachineInstr &MI) const {
4690   MachineIRBuilder &B = Helper.MIRBuilder;
4691   MachineRegisterInfo &MRI = *B.getMRI();
4692 
4693   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4694   auto IntrID = MI.getIntrinsicID();
4695   switch (IntrID) {
4696   case Intrinsic::amdgcn_if:
4697   case Intrinsic::amdgcn_else: {
4698     MachineInstr *Br = nullptr;
4699     MachineBasicBlock *UncondBrTarget = nullptr;
4700     bool Negated = false;
4701     if (MachineInstr *BrCond =
4702             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4703       const SIRegisterInfo *TRI
4704         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4705 
4706       Register Def = MI.getOperand(1).getReg();
4707       Register Use = MI.getOperand(3).getReg();
4708 
4709       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4710 
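      // verifyCFIntrinsic reports Negated when the G_BRCOND condition is the
      // negation of the intrinsic result; swap the targets to compensate.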
4711       if (Negated)
4712         std::swap(CondBrTarget, UncondBrTarget);
4713 
4714       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4715       if (IntrID == Intrinsic::amdgcn_if) {
4716         B.buildInstr(AMDGPU::SI_IF)
4717           .addDef(Def)
4718           .addUse(Use)
4719           .addMBB(UncondBrTarget);
4720       } else {
4721         B.buildInstr(AMDGPU::SI_ELSE)
4722             .addDef(Def)
4723             .addUse(Use)
4724             .addMBB(UncondBrTarget);
4725       }
4726 
4727       if (Br) {
4728         Br->getOperand(0).setMBB(CondBrTarget);
4729       } else {
4730         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4731         // since we're swapping branch targets it needs to be reinserted.
4732         // FIXME: IRTranslator should probably not do this
4733         B.buildBr(*CondBrTarget);
4734       }
4735 
4736       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4737       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4738       MI.eraseFromParent();
4739       BrCond->eraseFromParent();
4740       return true;
4741     }
4742 
4743     return false;
4744   }
4745   case Intrinsic::amdgcn_loop: {
4746     MachineInstr *Br = nullptr;
4747     MachineBasicBlock *UncondBrTarget = nullptr;
4748     bool Negated = false;
4749     if (MachineInstr *BrCond =
4750             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4751       const SIRegisterInfo *TRI
4752         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4753 
4754       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4755       Register Reg = MI.getOperand(2).getReg();
4756 
4757       if (Negated)
4758         std::swap(CondBrTarget, UncondBrTarget);
4759 
4760       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4761       B.buildInstr(AMDGPU::SI_LOOP)
4762         .addUse(Reg)
4763         .addMBB(UncondBrTarget);
4764 
4765       if (Br)
4766         Br->getOperand(0).setMBB(CondBrTarget);
4767       else
4768         B.buildBr(*CondBrTarget);
4769 
4770       MI.eraseFromParent();
4771       BrCond->eraseFromParent();
4772       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4773       return true;
4774     }
4775 
4776     return false;
4777   }
4778   case Intrinsic::amdgcn_kernarg_segment_ptr:
4779     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4780       // This only makes sense to call in a kernel, so just lower to null.
4781       B.buildConstant(MI.getOperand(0).getReg(), 0);
4782       MI.eraseFromParent();
4783       return true;
4784     }
4785 
4786     return legalizePreloadedArgIntrin(
4787       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4788   case Intrinsic::amdgcn_implicitarg_ptr:
4789     return legalizeImplicitArgPtr(MI, MRI, B);
4790   case Intrinsic::amdgcn_workitem_id_x:
4791     return legalizePreloadedArgIntrin(MI, MRI, B,
4792                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4793   case Intrinsic::amdgcn_workitem_id_y:
4794     return legalizePreloadedArgIntrin(MI, MRI, B,
4795                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4796   case Intrinsic::amdgcn_workitem_id_z:
4797     return legalizePreloadedArgIntrin(MI, MRI, B,
4798                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4799   case Intrinsic::amdgcn_workgroup_id_x:
4800     return legalizePreloadedArgIntrin(MI, MRI, B,
4801                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4802   case Intrinsic::amdgcn_workgroup_id_y:
4803     return legalizePreloadedArgIntrin(MI, MRI, B,
4804                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4805   case Intrinsic::amdgcn_workgroup_id_z:
4806     return legalizePreloadedArgIntrin(MI, MRI, B,
4807                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4808   case Intrinsic::amdgcn_dispatch_ptr:
4809     return legalizePreloadedArgIntrin(MI, MRI, B,
4810                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4811   case Intrinsic::amdgcn_queue_ptr:
4812     return legalizePreloadedArgIntrin(MI, MRI, B,
4813                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4814   case Intrinsic::amdgcn_implicit_buffer_ptr:
4815     return legalizePreloadedArgIntrin(
4816       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4817   case Intrinsic::amdgcn_dispatch_id:
4818     return legalizePreloadedArgIntrin(MI, MRI, B,
4819                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4820   case Intrinsic::amdgcn_fdiv_fast:
4821     return legalizeFDIVFastIntrin(MI, MRI, B);
4822   case Intrinsic::amdgcn_is_shared:
4823     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4824   case Intrinsic::amdgcn_is_private:
4825     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4826   case Intrinsic::amdgcn_wavefrontsize: {
4827     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4828     MI.eraseFromParent();
4829     return true;
4830   }
4831   case Intrinsic::amdgcn_s_buffer_load:
4832     return legalizeSBufferLoad(Helper, MI);
4833   case Intrinsic::amdgcn_raw_buffer_store:
4834   case Intrinsic::amdgcn_struct_buffer_store:
4835     return legalizeBufferStore(MI, MRI, B, false, false);
4836   case Intrinsic::amdgcn_raw_buffer_store_format:
4837   case Intrinsic::amdgcn_struct_buffer_store_format:
4838     return legalizeBufferStore(MI, MRI, B, false, true);
4839   case Intrinsic::amdgcn_raw_tbuffer_store:
4840   case Intrinsic::amdgcn_struct_tbuffer_store:
4841     return legalizeBufferStore(MI, MRI, B, true, true);
4842   case Intrinsic::amdgcn_raw_buffer_load:
4843   case Intrinsic::amdgcn_struct_buffer_load:
4844     return legalizeBufferLoad(MI, MRI, B, false, false);
4845   case Intrinsic::amdgcn_raw_buffer_load_format:
4846   case Intrinsic::amdgcn_struct_buffer_load_format:
4847     return legalizeBufferLoad(MI, MRI, B, true, false);
4848   case Intrinsic::amdgcn_raw_tbuffer_load:
4849   case Intrinsic::amdgcn_struct_tbuffer_load:
4850     return legalizeBufferLoad(MI, MRI, B, true, true);
4851   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4852   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4853   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4854   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4855   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4856   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4857   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4858   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4859   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4860   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4861   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4862   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4863   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4864   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4865   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4866   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4867   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4868   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4869   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4870   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4871   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4872   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4873   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4874   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4875   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4876   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4877   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4878   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4879   case Intrinsic::amdgcn_buffer_atomic_fadd:
4880   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4881   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4882   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4883   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4884     return legalizeBufferAtomic(MI, B, IntrID);
4885   case Intrinsic::amdgcn_atomic_inc:
4886     return legalizeAtomicIncDec(MI, B, true);
4887   case Intrinsic::amdgcn_atomic_dec:
4888     return legalizeAtomicIncDec(MI, B, false);
4889   case Intrinsic::trap:
4890     return legalizeTrapIntrinsic(MI, MRI, B);
4891   case Intrinsic::debugtrap:
4892     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4893   case Intrinsic::amdgcn_rsq_clamp:
4894     return legalizeRsqClampIntrinsic(MI, MRI, B);
4895   case Intrinsic::amdgcn_ds_fadd:
4896   case Intrinsic::amdgcn_ds_fmin:
4897   case Intrinsic::amdgcn_ds_fmax:
4898     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4899   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4900     return legalizeBVHIntrinsic(MI, B);
4901   default: {
4902     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4903             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4904       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4905     return true;
4906   }
4907   }
4908 
4909   return true;
4910 }
4911