1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/ADT/ScopeExit.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 
28 #define DEBUG_TYPE "amdgpu-legalinfo"
29 
30 using namespace llvm;
31 using namespace LegalizeActions;
32 using namespace LegalizeMutations;
33 using namespace LegalityPredicates;
34 using namespace MIPatternMatch;
35 
36 // Hack until load/store selection patterns support any tuple of legal types.
37 static cl::opt<bool> EnableNewLegality(
38   "amdgpu-global-isel-new-legality",
39   cl::desc("Use GlobalISel desired legality, rather than try to use"
40            "rules compatible with selection patterns"),
41   cl::init(false),
42   cl::ReallyHidden);
43 
44 static constexpr unsigned MaxRegisterSize = 1024;
45 
46 // Round the number of elements to the next power of two elements
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the number of bits to the next power of two bits
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 /// \returs true if this is an odd sized vector which should widen by adding an
61 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
62 /// excludes s1 vectors, which should always be scalarized.
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     if (!Ty.isVector())
67       return false;
68 
69     const LLT EltTy = Ty.getElementType();
70     const unsigned EltSize = EltTy.getSizeInBits();
71     return Ty.getNumElements() % 2 != 0 &&
72            EltSize > 1 && EltSize < 32 &&
73            Ty.getSizeInBits() % 32 != 0;
74   };
75 }
76 
77 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
78   return [=](const LegalityQuery &Query) {
79     const LLT Ty = Query.Types[TypeIdx];
80     return Ty.getSizeInBits() % 32 == 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LLT getBitcastRegisterType(const LLT Ty) {
130   const unsigned Size = Ty.getSizeInBits();
131 
132   LLT CoercedTy;
133   if (Size <= 32) {
134     // <2 x s8> -> s16
135     // <4 x s8> -> s32
136     return LLT::scalar(Size);
137   }
138 
139   return LLT::scalarOrVector(Size / 32, 32);
140 }
141 
142 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
143   return [=](const LegalityQuery &Query) {
144     const LLT Ty = Query.Types[TypeIdx];
145     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
146   };
147 }
148 
149 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT Ty = Query.Types[TypeIdx];
152     unsigned Size = Ty.getSizeInBits();
153     assert(Size % 32 == 0);
154     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
155   };
156 }
157 
158 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
159   return [=](const LegalityQuery &Query) {
160     const LLT QueryTy = Query.Types[TypeIdx];
161     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
162   };
163 }
164 
165 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
166   return [=](const LegalityQuery &Query) {
167     const LLT QueryTy = Query.Types[TypeIdx];
168     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
169   };
170 }
171 
172 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
173   return [=](const LegalityQuery &Query) {
174     const LLT QueryTy = Query.Types[TypeIdx];
175     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
176   };
177 }
178 
179 static bool isRegisterSize(unsigned Size) {
180   return Size % 32 == 0 && Size <= MaxRegisterSize;
181 }
182 
183 static bool isRegisterVectorElementType(LLT EltTy) {
184   const int EltSize = EltTy.getSizeInBits();
185   return EltSize == 16 || EltSize % 32 == 0;
186 }
187 
188 static bool isRegisterVectorType(LLT Ty) {
189   const int EltSize = Ty.getElementType().getSizeInBits();
190   return EltSize == 32 || EltSize == 64 ||
191          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
192          EltSize == 128 || EltSize == 256;
193 }
194 
195 static bool isRegisterType(LLT Ty) {
196   if (!isRegisterSize(Ty.getSizeInBits()))
197     return false;
198 
199   if (Ty.isVector())
200     return isRegisterVectorType(Ty);
201 
202   return true;
203 }
204 
205 // Any combination of 32 or 64-bit elements up the maximum register size, and
206 // multiples of v2s16.
207 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
208   return [=](const LegalityQuery &Query) {
209     return isRegisterType(Query.Types[TypeIdx]);
210   };
211 }
212 
213 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     const LLT QueryTy = Query.Types[TypeIdx];
216     if (!QueryTy.isVector())
217       return false;
218     const LLT EltTy = QueryTy.getElementType();
219     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
220   };
221 }
222 
223 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
224   return [=](const LegalityQuery &Query) {
225     const LLT Ty = Query.Types[TypeIdx];
226     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
227            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
228   };
229 }
230 
231 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
232 // handle some operations by just promoting the register during
233 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
234 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
235                                     bool IsLoad) {
236   switch (AS) {
237   case AMDGPUAS::PRIVATE_ADDRESS:
238     // FIXME: Private element size.
239     return ST.enableFlatScratch() ? 128 : 32;
240   case AMDGPUAS::LOCAL_ADDRESS:
241     return ST.useDS128() ? 128 : 64;
242   case AMDGPUAS::GLOBAL_ADDRESS:
243   case AMDGPUAS::CONSTANT_ADDRESS:
244   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
245     // Treat constant and global as identical. SMRD loads are sometimes usable for
246     // global loads (ideally constant address space should be eliminated)
247     // depending on the context. Legality cannot be context dependent, but
248     // RegBankSelect can split the load as necessary depending on the pointer
249     // register bank/uniformity and if the memory is invariant or not written in a
250     // kernel.
251     return IsLoad ? 512 : 128;
252   default:
253     // Flat addresses may contextually need to be split to 32-bit parts if they
254     // may alias scratch depending on the subtarget.
255     return 128;
256   }
257 }
258 
259 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
260                                  const LegalityQuery &Query,
261                                  unsigned Opcode) {
262   const LLT Ty = Query.Types[0];
263 
264   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
265   const bool IsLoad = Opcode != AMDGPU::G_STORE;
266 
267   unsigned RegSize = Ty.getSizeInBits();
268   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
269   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
270   unsigned AS = Query.Types[1].getAddressSpace();
271 
272   // All of these need to be custom lowered to cast the pointer operand.
273   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
274     return false;
275 
276   // TODO: We should be able to widen loads if the alignment is high enough, but
277   // we also need to modify the memory access size.
278 #if 0
279   // Accept widening loads based on alignment.
280   if (IsLoad && MemSize < Size)
281     MemSize = std::max(MemSize, Align);
282 #endif
283 
284   // Only 1-byte and 2-byte to 32-bit extloads are valid.
285   if (MemSize != RegSize && RegSize != 32)
286     return false;
287 
288   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
289     return false;
290 
291   switch (MemSize) {
292   case 8:
293   case 16:
294   case 32:
295   case 64:
296   case 128:
297     break;
298   case 96:
299     if (!ST.hasDwordx3LoadStores())
300       return false;
301     break;
302   case 256:
303   case 512:
304     // These may contextually need to be broken down.
305     break;
306   default:
307     return false;
308   }
309 
310   assert(RegSize >= MemSize);
311 
312   if (AlignBits < MemSize) {
313     const SITargetLowering *TLI = ST.getTargetLowering();
314     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
315                                                  Align(AlignBits / 8)))
316       return false;
317   }
318 
319   return true;
320 }
321 
322 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
323 // workaround this. Eventually it should ignore the type for loads and only care
324 // about the size. Return true in cases where we will workaround this for now by
325 // bitcasting.
326 static bool loadStoreBitcastWorkaround(const LLT Ty) {
327   if (EnableNewLegality)
328     return false;
329 
330   const unsigned Size = Ty.getSizeInBits();
331   if (Size <= 64)
332     return false;
333   if (!Ty.isVector())
334     return true;
335 
336   LLT EltTy = Ty.getElementType();
337   if (EltTy.isPointer())
338     return true;
339 
340   unsigned EltSize = EltTy.getSizeInBits();
341   return EltSize != 32 && EltSize != 64;
342 }
343 
344 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
345                              unsigned Opcode) {
346   const LLT Ty = Query.Types[0];
347   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
348          !loadStoreBitcastWorkaround(Ty);
349 }
350 
351 /// Return true if a load or store of the type should be lowered with a bitcast
352 /// to a different type.
353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354                                        const unsigned MemSizeInBits) {
355   const unsigned Size = Ty.getSizeInBits();
356     if (Size != MemSizeInBits)
357       return Size <= 32 && Ty.isVector();
358 
359   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360     return true;
361   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362          !isRegisterVectorElementType(Ty.getElementType());
363 }
364 
365 /// Return true if we should legalize a load by widening an odd sized memory
366 /// access up to the alignment. Note this case when the memory access itself
367 /// changes, not the size of the result register.
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
369                             unsigned AlignInBits, unsigned AddrSpace,
370                             unsigned Opcode) {
371   // We don't want to widen cases that are naturally legal.
372   if (isPowerOf2_32(SizeInBits))
373     return false;
374 
375   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
376   // end up widening these for a scalar load during RegBankSelect, since there
377   // aren't 96-bit scalar loads.
378   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
379     return false;
380 
381   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
382     return false;
383 
384   // A load is known dereferenceable up to the alignment, so it's legal to widen
385   // to it.
386   //
387   // TODO: Could check dereferenceable for less aligned cases.
388   unsigned RoundedSize = NextPowerOf2(SizeInBits);
389   if (AlignInBits < RoundedSize)
390     return false;
391 
392   // Do not widen if it would introduce a slow unaligned load.
393   const SITargetLowering *TLI = ST.getTargetLowering();
394   bool Fast = false;
395   return TLI->allowsMisalignedMemoryAccessesImpl(
396              RoundedSize, AddrSpace, Align(AlignInBits / 8),
397              MachineMemOperand::MOLoad, &Fast) &&
398          Fast;
399 }
400 
401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402                             unsigned Opcode) {
403   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404     return false;
405 
406   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407                          Query.MMODescrs[0].AlignInBits,
408                          Query.Types[1].getAddressSpace(), Opcode);
409 }
410 
411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
412                                          const GCNTargetMachine &TM)
413   :  ST(ST_) {
414   using namespace TargetOpcode;
415 
416   auto GetAddrSpacePtr = [&TM](unsigned AS) {
417     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
418   };
419 
420   const LLT S1 = LLT::scalar(1);
421   const LLT S8 = LLT::scalar(8);
422   const LLT S16 = LLT::scalar(16);
423   const LLT S32 = LLT::scalar(32);
424   const LLT S64 = LLT::scalar(64);
425   const LLT S128 = LLT::scalar(128);
426   const LLT S256 = LLT::scalar(256);
427   const LLT S512 = LLT::scalar(512);
428   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
429 
430   const LLT V2S8 = LLT::vector(2, 8);
431   const LLT V2S16 = LLT::vector(2, 16);
432   const LLT V4S16 = LLT::vector(4, 16);
433 
434   const LLT V2S32 = LLT::vector(2, 32);
435   const LLT V3S32 = LLT::vector(3, 32);
436   const LLT V4S32 = LLT::vector(4, 32);
437   const LLT V5S32 = LLT::vector(5, 32);
438   const LLT V6S32 = LLT::vector(6, 32);
439   const LLT V7S32 = LLT::vector(7, 32);
440   const LLT V8S32 = LLT::vector(8, 32);
441   const LLT V9S32 = LLT::vector(9, 32);
442   const LLT V10S32 = LLT::vector(10, 32);
443   const LLT V11S32 = LLT::vector(11, 32);
444   const LLT V12S32 = LLT::vector(12, 32);
445   const LLT V13S32 = LLT::vector(13, 32);
446   const LLT V14S32 = LLT::vector(14, 32);
447   const LLT V15S32 = LLT::vector(15, 32);
448   const LLT V16S32 = LLT::vector(16, 32);
449   const LLT V32S32 = LLT::vector(32, 32);
450 
451   const LLT V2S64 = LLT::vector(2, 64);
452   const LLT V3S64 = LLT::vector(3, 64);
453   const LLT V4S64 = LLT::vector(4, 64);
454   const LLT V5S64 = LLT::vector(5, 64);
455   const LLT V6S64 = LLT::vector(6, 64);
456   const LLT V7S64 = LLT::vector(7, 64);
457   const LLT V8S64 = LLT::vector(8, 64);
458   const LLT V16S64 = LLT::vector(16, 64);
459 
460   std::initializer_list<LLT> AllS32Vectors =
461     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463   std::initializer_list<LLT> AllS64Vectors =
464     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
465 
466   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
467   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
468   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
469   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
470   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
471   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
472   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
473 
474   const LLT CodePtr = FlatPtr;
475 
476   const std::initializer_list<LLT> AddrSpaces64 = {
477     GlobalPtr, ConstantPtr, FlatPtr
478   };
479 
480   const std::initializer_list<LLT> AddrSpaces32 = {
481     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
482   };
483 
484   const std::initializer_list<LLT> FPTypesBase = {
485     S32, S64
486   };
487 
488   const std::initializer_list<LLT> FPTypes16 = {
489     S32, S64, S16
490   };
491 
492   const std::initializer_list<LLT> FPTypesPK16 = {
493     S32, S64, S16, V2S16
494   };
495 
496   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
497 
498   setAction({G_BRCOND, S1}, Legal); // VCC branches
499   setAction({G_BRCOND, S32}, Legal); // SCC branches
500 
501   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
502   // elements for v3s16
503   getActionDefinitionsBuilder(G_PHI)
504     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505     .legalFor(AllS32Vectors)
506     .legalFor(AllS64Vectors)
507     .legalFor(AddrSpaces64)
508     .legalFor(AddrSpaces32)
509     .legalIf(isPointer(0))
510     .clampScalar(0, S16, S256)
511     .widenScalarToNextPow2(0, 32)
512     .clampMaxNumElements(0, S32, 16)
513     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514     .scalarize(0);
515 
516   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517     // Full set of gfx9 features.
518     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
519       .legalFor({S32, S16, V2S16})
520       .clampScalar(0, S16, S32)
521       .clampMaxNumElements(0, S16, 2)
522       .scalarize(0)
523       .widenScalarToNextPow2(0, 32);
524 
525     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526       .legalFor({S32, S16, V2S16}) // Clamp modifier
527       .minScalarOrElt(0, S16)
528       .clampMaxNumElements(0, S16, 2)
529       .scalarize(0)
530       .widenScalarToNextPow2(0, 32)
531       .lower();
532   } else if (ST.has16BitInsts()) {
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16})
535       .clampScalar(0, S16, S32)
536       .scalarize(0)
537       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538 
539     // Technically the saturating operations require clamp bit support, but this
540     // was introduced at the same time as 16-bit operations.
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542       .legalFor({S32, S16}) // Clamp modifier
543       .minScalar(0, S16)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 16)
546       .lower();
547 
548     // We're just lowering this, but it helps get a better result to try to
549     // coerce to the desired type first.
550     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551       .minScalar(0, S16)
552       .scalarize(0)
553       .lower();
554   } else {
555     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
556       .legalFor({S32})
557       .clampScalar(0, S32, S32)
558       .scalarize(0);
559 
560     if (ST.hasIntClamp()) {
561       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562         .legalFor({S32}) // Clamp modifier.
563         .scalarize(0)
564         .minScalarOrElt(0, S32)
565         .lower();
566     } else {
567       // Clamp bit support was added in VI, along with 16-bit operations.
568       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569         .minScalar(0, S32)
570         .scalarize(0)
571         .lower();
572     }
573 
574     // FIXME: DAG expansion gets better results. The widening uses the smaller
575     // range values and goes for the min/max lowering directly.
576     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577       .minScalar(0, S32)
578       .scalarize(0)
579       .lower();
580   }
581 
582   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
583     .customFor({S32, S64})
584     .clampScalar(0, S32, S64)
585     .widenScalarToNextPow2(0, 32)
586     .scalarize(0);
587 
588   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
589                    .legalFor({S32})
590                    .maxScalarOrElt(0, S32);
591 
592   if (ST.hasVOP3PInsts()) {
593     Mulh
594       .clampMaxNumElements(0, S8, 2)
595       .lowerFor({V2S8});
596   }
597 
598   Mulh
599     .scalarize(0)
600     .lower();
601 
602   // Report legal for any types we can handle anywhere. For the cases only legal
603   // on the SALU, RegBankSelect will be able to re-legalize.
604   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
605     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
606     .clampScalar(0, S32, S64)
607     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
608     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
609     .widenScalarToNextPow2(0)
610     .scalarize(0);
611 
612   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
613                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614     .legalFor({{S32, S1}, {S32, S32}})
615     .minScalar(0, S32)
616     // TODO: .scalarize(0)
617     .lower();
618 
619   getActionDefinitionsBuilder(G_BITCAST)
620     // Don't worry about the size constraint.
621     .legalIf(all(isRegisterType(0), isRegisterType(1)))
622     .lower();
623 
624 
625   getActionDefinitionsBuilder(G_CONSTANT)
626     .legalFor({S1, S32, S64, S16, GlobalPtr,
627                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
628     .legalIf(isPointer(0))
629     .clampScalar(0, S32, S64)
630     .widenScalarToNextPow2(0);
631 
632   getActionDefinitionsBuilder(G_FCONSTANT)
633     .legalFor({S32, S64, S16})
634     .clampScalar(0, S16, S64);
635 
636   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
637       .legalIf(isRegisterType(0))
638       // s1 and s16 are special cases because they have legal operations on
639       // them, but don't really occupy registers in the normal way.
640       .legalFor({S1, S16})
641       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
642       .clampScalarOrElt(0, S32, MaxScalar)
643       .widenScalarToNextPow2(0, 32)
644       .clampMaxNumElements(0, S32, 16);
645 
646   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
647 
648   // If the amount is divergent, we have to do a wave reduction to get the
649   // maximum value, so this is expanded during RegBankSelect.
650   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
651     .legalFor({{PrivatePtr, S32}});
652 
653   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
654     .customIf(typeIsNot(0, PrivatePtr));
655 
656   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
657 
658   auto &FPOpActions = getActionDefinitionsBuilder(
659     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
660     .legalFor({S32, S64});
661   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
662     .customFor({S32, S64});
663   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
664     .customFor({S32, S64});
665 
666   if (ST.has16BitInsts()) {
667     if (ST.hasVOP3PInsts())
668       FPOpActions.legalFor({S16, V2S16});
669     else
670       FPOpActions.legalFor({S16});
671 
672     TrigActions.customFor({S16});
673     FDIVActions.customFor({S16});
674   }
675 
676   auto &MinNumMaxNum = getActionDefinitionsBuilder({
677       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
678 
679   if (ST.hasVOP3PInsts()) {
680     MinNumMaxNum.customFor(FPTypesPK16)
681       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682       .clampMaxNumElements(0, S16, 2)
683       .clampScalar(0, S16, S64)
684       .scalarize(0);
685   } else if (ST.has16BitInsts()) {
686     MinNumMaxNum.customFor(FPTypes16)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else {
690     MinNumMaxNum.customFor(FPTypesBase)
691       .clampScalar(0, S32, S64)
692       .scalarize(0);
693   }
694 
695   if (ST.hasVOP3PInsts())
696     FPOpActions.clampMaxNumElements(0, S16, 2);
697 
698   FPOpActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   TrigActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   FDIVActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   getActionDefinitionsBuilder({G_FNEG, G_FABS})
711     .legalFor(FPTypesPK16)
712     .clampMaxNumElements(0, S16, 2)
713     .scalarize(0)
714     .clampScalar(0, S16, S64);
715 
716   if (ST.has16BitInsts()) {
717     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
718       .legalFor({S32, S64, S16})
719       .scalarize(0)
720       .clampScalar(0, S16, S64);
721   } else {
722     getActionDefinitionsBuilder(G_FSQRT)
723       .legalFor({S32, S64})
724       .scalarize(0)
725       .clampScalar(0, S32, S64);
726 
727     if (ST.hasFractBug()) {
728       getActionDefinitionsBuilder(G_FFLOOR)
729         .customFor({S64})
730         .legalFor({S32, S64})
731         .scalarize(0)
732         .clampScalar(0, S32, S64);
733     } else {
734       getActionDefinitionsBuilder(G_FFLOOR)
735         .legalFor({S32, S64})
736         .scalarize(0)
737         .clampScalar(0, S32, S64);
738     }
739   }
740 
741   getActionDefinitionsBuilder(G_FPTRUNC)
742     .legalFor({{S32, S64}, {S16, S32}})
743     .scalarize(0)
744     .lower();
745 
746   getActionDefinitionsBuilder(G_FPEXT)
747     .legalFor({{S64, S32}, {S32, S16}})
748     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
749     .scalarize(0);
750 
751   getActionDefinitionsBuilder(G_FSUB)
752       // Use actual fsub instruction
753       .legalFor({S32})
754       // Must use fadd + fneg
755       .lowerFor({S64, S16, V2S16})
756       .scalarize(0)
757       .clampScalar(0, S32, S64);
758 
759   // Whether this is legal depends on the floating point mode for the function.
760   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
761   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
762     FMad.customFor({S32, S16});
763   else if (ST.hasMadMacF32Insts())
764     FMad.customFor({S32});
765   else if (ST.hasMadF16())
766     FMad.customFor({S16});
767   FMad.scalarize(0)
768       .lower();
769 
770   auto &FRem = getActionDefinitionsBuilder(G_FREM);
771   if (ST.has16BitInsts()) {
772     FRem.customFor({S16, S32, S64});
773   } else {
774     FRem.minScalar(0, S32)
775         .customFor({S32, S64});
776   }
777   FRem.scalarize(0);
778 
779   // TODO: Do we need to clamp maximum bitwidth?
780   getActionDefinitionsBuilder(G_TRUNC)
781     .legalIf(isScalar(0))
782     .legalFor({{V2S16, V2S32}})
783     .clampMaxNumElements(0, S16, 2)
784     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
785     // situations (like an invalid implicit use), we don't want to infinite loop
786     // in the legalizer.
787     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
788     .alwaysLegal();
789 
790   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
791     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
792                {S32, S1}, {S64, S1}, {S16, S1}})
793     .scalarize(0)
794     .clampScalar(0, S32, S64)
795     .widenScalarToNextPow2(1, 32);
796 
797   // TODO: Split s1->s64 during regbankselect for VALU.
798   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
799     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
800     .lowerFor({{S32, S64}})
801     .lowerIf(typeIs(1, S1))
802     .customFor({{S64, S64}});
803   if (ST.has16BitInsts())
804     IToFP.legalFor({{S16, S16}});
805   IToFP.clampScalar(1, S32, S64)
806        .minScalar(0, S32)
807        .scalarize(0)
808        .widenScalarToNextPow2(1);
809 
810   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
811     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812     .customFor({{S64, S64}})
813     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
814   if (ST.has16BitInsts())
815     FPToI.legalFor({{S16, S16}});
816   else
817     FPToI.minScalar(1, S32);
818 
819   FPToI.minScalar(0, S32)
820        .scalarize(0)
821        .lower();
822 
823   // Lower roundeven into G_FRINT
824   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
825     .scalarize(0)
826     .lower();
827 
828   if (ST.has16BitInsts()) {
829     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830       .legalFor({S16, S32, S64})
831       .clampScalar(0, S16, S64)
832       .scalarize(0);
833   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
834     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
835       .legalFor({S32, S64})
836       .clampScalar(0, S32, S64)
837       .scalarize(0);
838   } else {
839     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
840       .legalFor({S32})
841       .customFor({S64})
842       .clampScalar(0, S32, S64)
843       .scalarize(0);
844   }
845 
846   getActionDefinitionsBuilder(G_PTR_ADD)
847     .legalIf(all(isPointer(0), sameSize(0, 1)))
848     .scalarize(0)
849     .scalarSameSizeAs(1, 0);
850 
851   getActionDefinitionsBuilder(G_PTRMASK)
852     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
853     .scalarSameSizeAs(1, 0)
854     .scalarize(0);
855 
856   auto &CmpBuilder =
857     getActionDefinitionsBuilder(G_ICMP)
858     // The compare output type differs based on the register bank of the output,
859     // so make both s1 and s32 legal.
860     //
861     // Scalar compares producing output in scc will be promoted to s32, as that
862     // is the allocatable register type that will be needed for the copy from
863     // scc. This will be promoted during RegBankSelect, and we assume something
864     // before that won't try to use s32 result types.
865     //
866     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
867     // bank.
868     .legalForCartesianProduct(
869       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870     .legalForCartesianProduct(
871       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
872   if (ST.has16BitInsts()) {
873     CmpBuilder.legalFor({{S1, S16}});
874   }
875 
876   CmpBuilder
877     .widenScalarToNextPow2(1)
878     .clampScalar(1, S32, S64)
879     .scalarize(0)
880     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
881 
882   getActionDefinitionsBuilder(G_FCMP)
883     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
884     .widenScalarToNextPow2(1)
885     .clampScalar(1, S32, S64)
886     .scalarize(0);
887 
888   // FIXME: fpow has a selection pattern that should move to custom lowering.
889   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
890   if (ST.has16BitInsts())
891     Exp2Ops.legalFor({S32, S16});
892   else
893     Exp2Ops.legalFor({S32});
894   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
895   Exp2Ops.scalarize(0);
896 
897   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
898   if (ST.has16BitInsts())
899     ExpOps.customFor({{S32}, {S16}});
900   else
901     ExpOps.customFor({S32});
902   ExpOps.clampScalar(0, MinScalarFPTy, S32)
903         .scalarize(0);
904 
905   getActionDefinitionsBuilder(G_FPOWI)
906     .clampScalar(0, MinScalarFPTy, S32)
907     .lower();
908 
909   // The 64-bit versions produce 32-bit results, but only on the SALU.
910   getActionDefinitionsBuilder(G_CTPOP)
911     .legalFor({{S32, S32}, {S32, S64}})
912     .clampScalar(0, S32, S32)
913     .clampScalar(1, S32, S64)
914     .scalarize(0)
915     .widenScalarToNextPow2(0, 32)
916     .widenScalarToNextPow2(1, 32);
917 
918   // The hardware instructions return a different result on 0 than the generic
919   // instructions expect. The hardware produces -1, but these produce the
920   // bitwidth.
921   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
922     .scalarize(0)
923     .clampScalar(0, S32, S32)
924     .clampScalar(1, S32, S64)
925     .widenScalarToNextPow2(0, 32)
926     .widenScalarToNextPow2(1, 32)
927     .lower();
928 
929   // The 64-bit versions produce 32-bit results, but only on the SALU.
930   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
931     .legalFor({{S32, S32}, {S32, S64}})
932     .clampScalar(0, S32, S32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32)
936     .widenScalarToNextPow2(1, 32);
937 
938   getActionDefinitionsBuilder(G_BITREVERSE)
939     .legalFor({S32})
940     .clampScalar(0, S32, S32)
941     .scalarize(0);
942 
943   if (ST.has16BitInsts()) {
944     getActionDefinitionsBuilder(G_BSWAP)
945       .legalFor({S16, S32, V2S16})
946       .clampMaxNumElements(0, S16, 2)
947       // FIXME: Fixing non-power-of-2 before clamp is workaround for
948       // narrowScalar limitation.
949       .widenScalarToNextPow2(0)
950       .clampScalar(0, S16, S32)
951       .scalarize(0);
952 
953     if (ST.hasVOP3PInsts()) {
954       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
955         .legalFor({S32, S16, V2S16})
956         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
957         .clampMaxNumElements(0, S16, 2)
958         .minScalar(0, S16)
959         .widenScalarToNextPow2(0)
960         .scalarize(0)
961         .lower();
962     } else {
963       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
964         .legalFor({S32, S16})
965         .widenScalarToNextPow2(0)
966         .minScalar(0, S16)
967         .scalarize(0)
968         .lower();
969     }
970   } else {
971     // TODO: Should have same legality without v_perm_b32
972     getActionDefinitionsBuilder(G_BSWAP)
973       .legalFor({S32})
974       .lowerIf(scalarNarrowerThan(0, 32))
975       // FIXME: Fixing non-power-of-2 before clamp is workaround for
976       // narrowScalar limitation.
977       .widenScalarToNextPow2(0)
978       .maxScalar(0, S32)
979       .scalarize(0)
980       .lower();
981 
982     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
983       .legalFor({S32})
984       .minScalar(0, S32)
985       .widenScalarToNextPow2(0)
986       .scalarize(0)
987       .lower();
988   }
989 
990   getActionDefinitionsBuilder(G_INTTOPTR)
991     // List the common cases
992     .legalForCartesianProduct(AddrSpaces64, {S64})
993     .legalForCartesianProduct(AddrSpaces32, {S32})
994     .scalarize(0)
995     // Accept any address space as long as the size matches
996     .legalIf(sameSize(0, 1))
997     .widenScalarIf(smallerThan(1, 0),
998       [](const LegalityQuery &Query) {
999         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1000       })
1001     .narrowScalarIf(largerThan(1, 0),
1002       [](const LegalityQuery &Query) {
1003         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004       });
1005 
1006   getActionDefinitionsBuilder(G_PTRTOINT)
1007     // List the common cases
1008     .legalForCartesianProduct(AddrSpaces64, {S64})
1009     .legalForCartesianProduct(AddrSpaces32, {S32})
1010     .scalarize(0)
1011     // Accept any address space as long as the size matches
1012     .legalIf(sameSize(0, 1))
1013     .widenScalarIf(smallerThan(0, 1),
1014       [](const LegalityQuery &Query) {
1015         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1016       })
1017     .narrowScalarIf(
1018       largerThan(0, 1),
1019       [](const LegalityQuery &Query) {
1020         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021       });
1022 
1023   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1024     .scalarize(0)
1025     .custom();
1026 
1027   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1028                                     bool IsLoad) -> bool {
1029     const LLT DstTy = Query.Types[0];
1030 
1031     // Split vector extloads.
1032     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1033     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1034 
1035     if (MemSize < DstTy.getSizeInBits())
1036       MemSize = std::max(MemSize, AlignBits);
1037 
1038     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1039       return true;
1040 
1041     const LLT PtrTy = Query.Types[1];
1042     unsigned AS = PtrTy.getAddressSpace();
1043     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1044       return true;
1045 
1046     // Catch weird sized loads that don't evenly divide into the access sizes
1047     // TODO: May be able to widen depending on alignment etc.
1048     unsigned NumRegs = (MemSize + 31) / 32;
1049     if (NumRegs == 3) {
1050       if (!ST.hasDwordx3LoadStores())
1051         return true;
1052     } else {
1053       // If the alignment allows, these should have been widened.
1054       if (!isPowerOf2_32(NumRegs))
1055         return true;
1056     }
1057 
1058     if (AlignBits < MemSize) {
1059       const SITargetLowering *TLI = ST.getTargetLowering();
1060       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1061                                                       Align(AlignBits / 8));
1062     }
1063 
1064     return false;
1065   };
1066 
1067   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1068   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1069   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1070 
1071   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1072   // LDS
1073   // TODO: Unsupported flat for SI.
1074 
1075   for (unsigned Op : {G_LOAD, G_STORE}) {
1076     const bool IsStore = Op == G_STORE;
1077 
1078     auto &Actions = getActionDefinitionsBuilder(Op);
1079     // Explicitly list some common cases.
1080     // TODO: Does this help compile time at all?
1081     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1082                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1083                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1084                                       {S64, GlobalPtr, 64, GlobalAlign32},
1085                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1086                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1087                                       {S32, GlobalPtr, 8, GlobalAlign8},
1088                                       {S32, GlobalPtr, 16, GlobalAlign16},
1089 
1090                                       {S32, LocalPtr, 32, 32},
1091                                       {S64, LocalPtr, 64, 32},
1092                                       {V2S32, LocalPtr, 64, 32},
1093                                       {S32, LocalPtr, 8, 8},
1094                                       {S32, LocalPtr, 16, 16},
1095                                       {V2S16, LocalPtr, 32, 32},
1096 
1097                                       {S32, PrivatePtr, 32, 32},
1098                                       {S32, PrivatePtr, 8, 8},
1099                                       {S32, PrivatePtr, 16, 16},
1100                                       {V2S16, PrivatePtr, 32, 32},
1101 
1102                                       {S32, ConstantPtr, 32, GlobalAlign32},
1103                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1104                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1105                                       {S64, ConstantPtr, 64, GlobalAlign32},
1106                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1107     Actions.legalIf(
1108       [=](const LegalityQuery &Query) -> bool {
1109         return isLoadStoreLegal(ST, Query, Op);
1110       });
1111 
1112     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1113     // 64-bits.
1114     //
1115     // TODO: Should generalize bitcast action into coerce, which will also cover
1116     // inserting addrspacecasts.
1117     Actions.customIf(typeIs(1, Constant32Ptr));
1118 
1119     // Turn any illegal element vectors into something easier to deal
1120     // with. These will ultimately produce 32-bit scalar shifts to extract the
1121     // parts anyway.
1122     //
1123     // For odd 16-bit element vectors, prefer to split those into pieces with
1124     // 16-bit vector parts.
1125     Actions.bitcastIf(
1126       [=](const LegalityQuery &Query) -> bool {
1127         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1128                                           Query.MMODescrs[0].SizeInBits);
1129       }, bitcastToRegisterType(0));
1130 
1131     if (!IsStore) {
1132       // Widen suitably aligned loads by loading extra bytes. The standard
1133       // legalization actions can't properly express widening memory operands.
1134       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1135         return shouldWidenLoad(ST, Query, G_LOAD);
1136       });
1137     }
1138 
1139     // FIXME: load/store narrowing should be moved to lower action
1140     Actions
1141         .narrowScalarIf(
1142             [=](const LegalityQuery &Query) -> bool {
1143               return !Query.Types[0].isVector() &&
1144                      needToSplitMemOp(Query, Op == G_LOAD);
1145             },
1146             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1147               const LLT DstTy = Query.Types[0];
1148               const LLT PtrTy = Query.Types[1];
1149 
1150               const unsigned DstSize = DstTy.getSizeInBits();
1151               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1152 
1153               // Split extloads.
1154               if (DstSize > MemSize)
1155                 return std::make_pair(0, LLT::scalar(MemSize));
1156 
1157               if (!isPowerOf2_32(DstSize)) {
1158                 // We're probably decomposing an odd sized store. Try to split
1159                 // to the widest type. TODO: Account for alignment. As-is it
1160                 // should be OK, since the new parts will be further legalized.
1161                 unsigned FloorSize = PowerOf2Floor(DstSize);
1162                 return std::make_pair(0, LLT::scalar(FloorSize));
1163               }
1164 
1165               if (DstSize > 32 && (DstSize % 32 != 0)) {
1166                 // FIXME: Need a way to specify non-extload of larger size if
1167                 // suitably aligned.
1168                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1169               }
1170 
1171               unsigned MaxSize = maxSizeForAddrSpace(ST,
1172                                                      PtrTy.getAddressSpace(),
1173                                                      Op == G_LOAD);
1174               if (MemSize > MaxSize)
1175                 return std::make_pair(0, LLT::scalar(MaxSize));
1176 
1177               unsigned Align = Query.MMODescrs[0].AlignInBits;
1178               return std::make_pair(0, LLT::scalar(Align));
1179             })
1180         .fewerElementsIf(
1181             [=](const LegalityQuery &Query) -> bool {
1182               return Query.Types[0].isVector() &&
1183                      needToSplitMemOp(Query, Op == G_LOAD);
1184             },
1185             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1186               const LLT DstTy = Query.Types[0];
1187               const LLT PtrTy = Query.Types[1];
1188 
1189               LLT EltTy = DstTy.getElementType();
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193 
1194               // FIXME: Handle widened to power of 2 results better. This ends
1195               // up scalarizing.
1196               // FIXME: 3 element stores scalarized on SI
1197 
1198               // Split if it's too large for the address space.
1199               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1200                 unsigned NumElts = DstTy.getNumElements();
1201                 unsigned EltSize = EltTy.getSizeInBits();
1202 
1203                 if (MaxSize % EltSize == 0) {
1204                   return std::make_pair(
1205                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(0,
1217                                       LLT::vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1222               if (DstTy.getSizeInBits() > MemSize)
1223                 return std::make_pair(0, EltTy);
1224 
1225               unsigned EltSize = EltTy.getSizeInBits();
1226               unsigned DstSize = DstTy.getSizeInBits();
1227               if (!isPowerOf2_32(DstSize)) {
1228                 // We're probably decomposing an odd sized store. Try to split
1229                 // to the widest type. TODO: Account for alignment. As-is it
1230                 // should be OK, since the new parts will be further legalized.
1231                 unsigned FloorSize = PowerOf2Floor(DstSize);
1232                 return std::make_pair(
1233                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1234               }
1235 
1236               // Need to split because of alignment.
1237               unsigned Align = Query.MMODescrs[0].AlignInBits;
1238               if (EltSize > Align &&
1239                   (EltSize / Align < DstTy.getNumElements())) {
1240                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1241               }
1242 
1243               // May need relegalization for the scalars.
1244               return std::make_pair(0, EltTy);
1245             })
1246     .lowerIfMemSizeNotPow2()
1247     .minScalar(0, S32);
1248 
1249     if (IsStore)
1250       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1251 
1252     Actions
1253         .widenScalarToNextPow2(0)
1254         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1255         .lower();
1256   }
1257 
1258   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1259                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1260                                                   {S32, GlobalPtr, 16, 2 * 8},
1261                                                   {S32, LocalPtr, 8, 8},
1262                                                   {S32, LocalPtr, 16, 16},
1263                                                   {S32, PrivatePtr, 8, 8},
1264                                                   {S32, PrivatePtr, 16, 16},
1265                                                   {S32, ConstantPtr, 8, 8},
1266                                                   {S32, ConstantPtr, 16, 2 * 8}});
1267   if (ST.hasFlatAddressSpace()) {
1268     ExtLoads.legalForTypesWithMemDesc(
1269         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1270   }
1271 
1272   ExtLoads.clampScalar(0, S32, S32)
1273           .widenScalarToNextPow2(0)
1274           .unsupportedIfMemSizeNotPow2()
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   if (ST.hasLDSFPAtomics()) {
1290     auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1291       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292     if (ST.hasGFX90AInsts())
1293       Atomic.legalFor({{S64, LocalPtr}});
1294   }
1295 
1296   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1297   // demarshalling
1298   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1299     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1300                 {S32, FlatPtr}, {S64, FlatPtr}})
1301     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1302                {S32, RegionPtr}, {S64, RegionPtr}});
1303   // TODO: Pointer types, any 32-bit or 64-bit vector
1304 
1305   // Condition should be s32 for scalar, s1 for vector.
1306   getActionDefinitionsBuilder(G_SELECT)
1307     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1308           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1309           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1310     .clampScalar(0, S16, S64)
1311     .scalarize(1)
1312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1313     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1314     .clampMaxNumElements(0, S32, 2)
1315     .clampMaxNumElements(0, LocalPtr, 2)
1316     .clampMaxNumElements(0, PrivatePtr, 2)
1317     .scalarize(0)
1318     .widenScalarToNextPow2(0)
1319     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1320 
1321   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1322   // be more flexible with the shift amount type.
1323   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1324     .legalFor({{S32, S32}, {S64, S32}});
1325   if (ST.has16BitInsts()) {
1326     if (ST.hasVOP3PInsts()) {
1327       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1328             .clampMaxNumElements(0, S16, 2);
1329     } else
1330       Shifts.legalFor({{S16, S16}});
1331 
1332     // TODO: Support 16-bit shift amounts for all types
1333     Shifts.widenScalarIf(
1334       [=](const LegalityQuery &Query) {
1335         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1336         // 32-bit amount.
1337         const LLT ValTy = Query.Types[0];
1338         const LLT AmountTy = Query.Types[1];
1339         return ValTy.getSizeInBits() <= 16 &&
1340                AmountTy.getSizeInBits() < 16;
1341       }, changeTo(1, S16));
1342     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1343     Shifts.clampScalar(1, S32, S32);
1344     Shifts.clampScalar(0, S16, S64);
1345     Shifts.widenScalarToNextPow2(0, 16);
1346 
1347     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1348       .minScalar(0, S16)
1349       .scalarize(0)
1350       .lower();
1351   } else {
1352     // Make sure we legalize the shift amount type first, as the general
1353     // expansion for the shifted type will produce much worse code if it hasn't
1354     // been truncated already.
1355     Shifts.clampScalar(1, S32, S32);
1356     Shifts.clampScalar(0, S32, S64);
1357     Shifts.widenScalarToNextPow2(0, 32);
1358 
1359     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1360       .minScalar(0, S32)
1361       .scalarize(0)
1362       .lower();
1363   }
1364   Shifts.scalarize(0);
1365 
1366   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1367     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1368     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1369     unsigned IdxTypeIdx = 2;
1370 
1371     getActionDefinitionsBuilder(Op)
1372       .customIf([=](const LegalityQuery &Query) {
1373           const LLT EltTy = Query.Types[EltTypeIdx];
1374           const LLT VecTy = Query.Types[VecTypeIdx];
1375           const LLT IdxTy = Query.Types[IdxTypeIdx];
1376           const unsigned EltSize = EltTy.getSizeInBits();
1377           return (EltSize == 32 || EltSize == 64) &&
1378                   VecTy.getSizeInBits() % 32 == 0 &&
1379                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1380                   IdxTy.getSizeInBits() == 32;
1381         })
1382       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1383                  bitcastToVectorElement32(VecTypeIdx))
1384       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1385       .bitcastIf(
1386         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1387         [=](const LegalityQuery &Query) {
1388           // For > 64-bit element types, try to turn this into a 64-bit
1389           // element vector since we may be able to do better indexing
1390           // if this is scalar. If not, fall back to 32.
1391           const LLT EltTy = Query.Types[EltTypeIdx];
1392           const LLT VecTy = Query.Types[VecTypeIdx];
1393           const unsigned DstEltSize = EltTy.getSizeInBits();
1394           const unsigned VecSize = VecTy.getSizeInBits();
1395 
1396           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1397           return std::make_pair(
1398             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1399         })
1400       .clampScalar(EltTypeIdx, S32, S64)
1401       .clampScalar(VecTypeIdx, S32, S64)
1402       .clampScalar(IdxTypeIdx, S32, S32)
1403       .clampMaxNumElements(VecTypeIdx, S32, 32)
1404       // TODO: Clamp elements for 64-bit vectors?
1405       // It should only be necessary with variable indexes.
1406       // As a last resort, lower to the stack
1407       .lower();
1408   }
1409 
1410   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1411     .unsupportedIf([=](const LegalityQuery &Query) {
1412         const LLT &EltTy = Query.Types[1].getElementType();
1413         return Query.Types[0] != EltTy;
1414       });
1415 
1416   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1417     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1418     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1419 
1420     // FIXME: Doesn't handle extract of illegal sizes.
1421     getActionDefinitionsBuilder(Op)
1422       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1423       // FIXME: Multiples of 16 should not be legal.
1424       .legalIf([=](const LegalityQuery &Query) {
1425           const LLT BigTy = Query.Types[BigTyIdx];
1426           const LLT LitTy = Query.Types[LitTyIdx];
1427           return (BigTy.getSizeInBits() % 32 == 0) &&
1428                  (LitTy.getSizeInBits() % 16 == 0);
1429         })
1430       .widenScalarIf(
1431         [=](const LegalityQuery &Query) {
1432           const LLT BigTy = Query.Types[BigTyIdx];
1433           return (BigTy.getScalarSizeInBits() < 16);
1434         },
1435         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1436       .widenScalarIf(
1437         [=](const LegalityQuery &Query) {
1438           const LLT LitTy = Query.Types[LitTyIdx];
1439           return (LitTy.getScalarSizeInBits() < 16);
1440         },
1441         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1442       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1443       .widenScalarToNextPow2(BigTyIdx, 32);
1444 
1445   }
1446 
1447   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1448     .legalForCartesianProduct(AllS32Vectors, {S32})
1449     .legalForCartesianProduct(AllS64Vectors, {S64})
1450     .clampNumElements(0, V16S32, V32S32)
1451     .clampNumElements(0, V2S64, V16S64)
1452     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1453 
1454   if (ST.hasScalarPackInsts()) {
1455     BuildVector
1456       // FIXME: Should probably widen s1 vectors straight to s32
1457       .minScalarOrElt(0, S16)
1458       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1459       .minScalar(1, S32);
1460 
1461     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1462       .legalFor({V2S16, S32})
1463       .lower();
1464     BuildVector.minScalarOrElt(0, S32);
1465   } else {
1466     BuildVector.customFor({V2S16, S16});
1467     BuildVector.minScalarOrElt(0, S32);
1468 
1469     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1470       .customFor({V2S16, S32})
1471       .lower();
1472   }
1473 
1474   BuildVector.legalIf(isRegisterType(0));
1475 
1476   // FIXME: Clamp maximum size
1477   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1478     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1479     .clampMaxNumElements(0, S32, 32)
1480     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1481     .clampMaxNumElements(0, S16, 64);
1482 
1483   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1484   // pre-legalize.
1485   if (ST.hasVOP3PInsts()) {
1486     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1487       .customFor({V2S16, V2S16})
1488       .lower();
1489   } else
1490     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1491 
1492   // Merge/Unmerge
1493   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1494     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1495     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1496 
1497     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1498       const LLT Ty = Query.Types[TypeIdx];
1499       if (Ty.isVector()) {
1500         const LLT &EltTy = Ty.getElementType();
1501         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1502           return true;
1503         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1504           return true;
1505       }
1506       return false;
1507     };
1508 
1509     auto &Builder = getActionDefinitionsBuilder(Op)
1510       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1511       .lowerFor({{S16, V2S16}})
1512       .lowerIf([=](const LegalityQuery &Query) {
1513           const LLT BigTy = Query.Types[BigTyIdx];
1514           return BigTy.getSizeInBits() == 32;
1515         })
1516       // Try to widen to s16 first for small types.
1517       // TODO: Only do this on targets with legal s16 shifts
1518       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1519       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1520       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1521       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1522                            elementTypeIs(1, S16)),
1523                        changeTo(1, V2S16))
1524       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1525       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1526       // valid.
1527       .clampScalar(LitTyIdx, S32, S512)
1528       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1529       // Break up vectors with weird elements into scalars
1530       .fewerElementsIf(
1531         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1532         scalarize(0))
1533       .fewerElementsIf(
1534         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1535         scalarize(1))
1536       .clampScalar(BigTyIdx, S32, MaxScalar);
1537 
1538     if (Op == G_MERGE_VALUES) {
1539       Builder.widenScalarIf(
1540         // TODO: Use 16-bit shifts if legal for 8-bit values?
1541         [=](const LegalityQuery &Query) {
1542           const LLT Ty = Query.Types[LitTyIdx];
1543           return Ty.getSizeInBits() < 32;
1544         },
1545         changeTo(LitTyIdx, S32));
1546     }
1547 
1548     Builder.widenScalarIf(
1549       [=](const LegalityQuery &Query) {
1550         const LLT Ty = Query.Types[BigTyIdx];
1551         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1552           Ty.getSizeInBits() % 16 != 0;
1553       },
1554       [=](const LegalityQuery &Query) {
1555         // Pick the next power of 2, or a multiple of 64 over 128.
1556         // Whichever is smaller.
1557         const LLT &Ty = Query.Types[BigTyIdx];
1558         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1559         if (NewSizeInBits >= 256) {
1560           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1561           if (RoundedTo < NewSizeInBits)
1562             NewSizeInBits = RoundedTo;
1563         }
1564         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1565       })
1566       // Any vectors left are the wrong size. Scalarize them.
1567       .scalarize(0)
1568       .scalarize(1);
1569   }
1570 
1571   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1572   // RegBankSelect.
1573   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1574     .legalFor({{S32}, {S64}});
1575 
1576   if (ST.hasVOP3PInsts()) {
1577     SextInReg.lowerFor({{V2S16}})
1578       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1579       // get more vector shift opportunities, since we'll get those when
1580       // expanded.
1581       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1582   } else if (ST.has16BitInsts()) {
1583     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1584   } else {
1585     // Prefer to promote to s32 before lowering if we don't have 16-bit
1586     // shifts. This avoid a lot of intermediate truncate and extend operations.
1587     SextInReg.lowerFor({{S32}, {S64}});
1588   }
1589 
1590   SextInReg
1591     .scalarize(0)
1592     .clampScalar(0, S32, S64)
1593     .lower();
1594 
1595   getActionDefinitionsBuilder(G_FSHR)
1596     .legalFor({{S32, S32}})
1597     .scalarize(0)
1598     .lower();
1599 
1600   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1601     .legalFor({S64});
1602 
1603   getActionDefinitionsBuilder(G_FENCE)
1604     .alwaysLegal();
1605 
1606   getActionDefinitionsBuilder({
1607       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1608       G_FCOPYSIGN,
1609 
1610       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1611       G_ATOMICRMW_NAND,
1612       G_ATOMICRMW_FSUB,
1613       G_READ_REGISTER,
1614       G_WRITE_REGISTER,
1615 
1616       G_SADDO, G_SSUBO,
1617 
1618        // TODO: Implement
1619       G_FMINIMUM, G_FMAXIMUM,
1620       G_FSHL
1621     }).lower();
1622 
1623   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1624         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1625         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1626     .unsupported();
1627 
1628   computeTables();
1629   verify(*ST.getInstrInfo());
1630 }
1631 
1632 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1633                                          MachineInstr &MI) const {
1634   MachineIRBuilder &B = Helper.MIRBuilder;
1635   MachineRegisterInfo &MRI = *B.getMRI();
1636 
1637   switch (MI.getOpcode()) {
1638   case TargetOpcode::G_ADDRSPACE_CAST:
1639     return legalizeAddrSpaceCast(MI, MRI, B);
1640   case TargetOpcode::G_FRINT:
1641     return legalizeFrint(MI, MRI, B);
1642   case TargetOpcode::G_FCEIL:
1643     return legalizeFceil(MI, MRI, B);
1644   case TargetOpcode::G_FREM:
1645     return legalizeFrem(MI, MRI, B);
1646   case TargetOpcode::G_INTRINSIC_TRUNC:
1647     return legalizeIntrinsicTrunc(MI, MRI, B);
1648   case TargetOpcode::G_SITOFP:
1649     return legalizeITOFP(MI, MRI, B, true);
1650   case TargetOpcode::G_UITOFP:
1651     return legalizeITOFP(MI, MRI, B, false);
1652   case TargetOpcode::G_FPTOSI:
1653     return legalizeFPTOI(MI, MRI, B, true);
1654   case TargetOpcode::G_FPTOUI:
1655     return legalizeFPTOI(MI, MRI, B, false);
1656   case TargetOpcode::G_FMINNUM:
1657   case TargetOpcode::G_FMAXNUM:
1658   case TargetOpcode::G_FMINNUM_IEEE:
1659   case TargetOpcode::G_FMAXNUM_IEEE:
1660     return legalizeMinNumMaxNum(Helper, MI);
1661   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1662     return legalizeExtractVectorElt(MI, MRI, B);
1663   case TargetOpcode::G_INSERT_VECTOR_ELT:
1664     return legalizeInsertVectorElt(MI, MRI, B);
1665   case TargetOpcode::G_SHUFFLE_VECTOR:
1666     return legalizeShuffleVector(MI, MRI, B);
1667   case TargetOpcode::G_FSIN:
1668   case TargetOpcode::G_FCOS:
1669     return legalizeSinCos(MI, MRI, B);
1670   case TargetOpcode::G_GLOBAL_VALUE:
1671     return legalizeGlobalValue(MI, MRI, B);
1672   case TargetOpcode::G_LOAD:
1673     return legalizeLoad(Helper, MI);
1674   case TargetOpcode::G_FMAD:
1675     return legalizeFMad(MI, MRI, B);
1676   case TargetOpcode::G_FDIV:
1677     return legalizeFDIV(MI, MRI, B);
1678   case TargetOpcode::G_UDIV:
1679   case TargetOpcode::G_UREM:
1680     return legalizeUDIV_UREM(MI, MRI, B);
1681   case TargetOpcode::G_SDIV:
1682   case TargetOpcode::G_SREM:
1683     return legalizeSDIV_SREM(MI, MRI, B);
1684   case TargetOpcode::G_ATOMIC_CMPXCHG:
1685     return legalizeAtomicCmpXChg(MI, MRI, B);
1686   case TargetOpcode::G_FLOG:
1687     return legalizeFlog(MI, B, numbers::ln2f);
1688   case TargetOpcode::G_FLOG10:
1689     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1690   case TargetOpcode::G_FEXP:
1691     return legalizeFExp(MI, B);
1692   case TargetOpcode::G_FPOW:
1693     return legalizeFPow(MI, B);
1694   case TargetOpcode::G_FFLOOR:
1695     return legalizeFFloor(MI, MRI, B);
1696   case TargetOpcode::G_BUILD_VECTOR:
1697     return legalizeBuildVector(MI, MRI, B);
1698   default:
1699     return false;
1700   }
1701 
1702   llvm_unreachable("expected switch to return");
1703 }
1704 
1705 Register AMDGPULegalizerInfo::getSegmentAperture(
1706   unsigned AS,
1707   MachineRegisterInfo &MRI,
1708   MachineIRBuilder &B) const {
1709   MachineFunction &MF = B.getMF();
1710   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1711   const LLT S32 = LLT::scalar(32);
1712 
1713   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1714 
1715   if (ST.hasApertureRegs()) {
1716     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1717     // getreg.
1718     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1719         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1720         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1721     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1722         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1723         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1724     unsigned Encoding =
1725         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1726         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1727         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1728 
1729     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1730 
1731     B.buildInstr(AMDGPU::S_GETREG_B32)
1732       .addDef(GetReg)
1733       .addImm(Encoding);
1734     MRI.setType(GetReg, S32);
1735 
1736     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1737     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1738   }
1739 
1740   Register QueuePtr = MRI.createGenericVirtualRegister(
1741     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1742 
1743   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1744     return Register();
1745 
1746   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1747   // private_segment_aperture_base_hi.
1748   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1749 
1750   // TODO: can we be smarter about machine pointer info?
1751   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1752   MachineMemOperand *MMO = MF.getMachineMemOperand(
1753       PtrInfo,
1754       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1755           MachineMemOperand::MOInvariant,
1756       4, commonAlignment(Align(64), StructOffset));
1757 
1758   Register LoadAddr;
1759 
1760   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1761   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1762 }
1763 
1764 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1765   MachineInstr &MI, MachineRegisterInfo &MRI,
1766   MachineIRBuilder &B) const {
1767   MachineFunction &MF = B.getMF();
1768 
1769   const LLT S32 = LLT::scalar(32);
1770   Register Dst = MI.getOperand(0).getReg();
1771   Register Src = MI.getOperand(1).getReg();
1772 
1773   LLT DstTy = MRI.getType(Dst);
1774   LLT SrcTy = MRI.getType(Src);
1775   unsigned DestAS = DstTy.getAddressSpace();
1776   unsigned SrcAS = SrcTy.getAddressSpace();
1777 
1778   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1779   // vector element.
1780   assert(!DstTy.isVector());
1781 
1782   const AMDGPUTargetMachine &TM
1783     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1784 
1785   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1786     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1787     return true;
1788   }
1789 
1790   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1791     // Truncate.
1792     B.buildExtract(Dst, Src, 0);
1793     MI.eraseFromParent();
1794     return true;
1795   }
1796 
1797   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1798     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1799     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1800 
1801     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1802     // another. Merge operands are required to be the same type, but creating an
1803     // extra ptrtoint would be kind of pointless.
1804     auto HighAddr = B.buildConstant(
1805       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1806     B.buildMerge(Dst, {Src, HighAddr});
1807     MI.eraseFromParent();
1808     return true;
1809   }
1810 
1811   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1812     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1813            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1814     unsigned NullVal = TM.getNullPointerValue(DestAS);
1815 
1816     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1817     auto FlatNull = B.buildConstant(SrcTy, 0);
1818 
1819     // Extract low 32-bits of the pointer.
1820     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1821 
1822     auto CmpRes =
1823         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1824     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1825 
1826     MI.eraseFromParent();
1827     return true;
1828   }
1829 
1830   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1831     return false;
1832 
1833   if (!ST.hasFlatAddressSpace())
1834     return false;
1835 
1836   auto SegmentNull =
1837       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1838   auto FlatNull =
1839       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1840 
1841   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1842   if (!ApertureReg.isValid())
1843     return false;
1844 
1845   auto CmpRes =
1846       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1847 
1848   // Coerce the type of the low half of the result so we can use merge_values.
1849   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1850 
1851   // TODO: Should we allow mismatched types but matching sizes in merges to
1852   // avoid the ptrtoint?
1853   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1854   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1855 
1856   MI.eraseFromParent();
1857   return true;
1858 }
1859 
1860 bool AMDGPULegalizerInfo::legalizeFrint(
1861   MachineInstr &MI, MachineRegisterInfo &MRI,
1862   MachineIRBuilder &B) const {
1863   Register Src = MI.getOperand(1).getReg();
1864   LLT Ty = MRI.getType(Src);
1865   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1866 
1867   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1868   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1869 
1870   auto C1 = B.buildFConstant(Ty, C1Val);
1871   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1872 
1873   // TODO: Should this propagate fast-math-flags?
1874   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1875   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1876 
1877   auto C2 = B.buildFConstant(Ty, C2Val);
1878   auto Fabs = B.buildFAbs(Ty, Src);
1879 
1880   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1881   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1882   MI.eraseFromParent();
1883   return true;
1884 }
1885 
1886 bool AMDGPULegalizerInfo::legalizeFceil(
1887   MachineInstr &MI, MachineRegisterInfo &MRI,
1888   MachineIRBuilder &B) const {
1889 
1890   const LLT S1 = LLT::scalar(1);
1891   const LLT S64 = LLT::scalar(64);
1892 
1893   Register Src = MI.getOperand(1).getReg();
1894   assert(MRI.getType(Src) == S64);
1895 
1896   // result = trunc(src)
1897   // if (src > 0.0 && src != result)
1898   //   result += 1.0
1899 
1900   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1901 
1902   const auto Zero = B.buildFConstant(S64, 0.0);
1903   const auto One = B.buildFConstant(S64, 1.0);
1904   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1905   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1906   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1907   auto Add = B.buildSelect(S64, And, One, Zero);
1908 
1909   // TODO: Should this propagate fast-math-flags?
1910   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1911   return true;
1912 }
1913 
1914 bool AMDGPULegalizerInfo::legalizeFrem(
1915   MachineInstr &MI, MachineRegisterInfo &MRI,
1916   MachineIRBuilder &B) const {
1917     Register DstReg = MI.getOperand(0).getReg();
1918     Register Src0Reg = MI.getOperand(1).getReg();
1919     Register Src1Reg = MI.getOperand(2).getReg();
1920     auto Flags = MI.getFlags();
1921     LLT Ty = MRI.getType(DstReg);
1922 
1923     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1924     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1925     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1926     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1927     MI.eraseFromParent();
1928     return true;
1929 }
1930 
1931 static MachineInstrBuilder extractF64Exponent(Register Hi,
1932                                               MachineIRBuilder &B) {
1933   const unsigned FractBits = 52;
1934   const unsigned ExpBits = 11;
1935   LLT S32 = LLT::scalar(32);
1936 
1937   auto Const0 = B.buildConstant(S32, FractBits - 32);
1938   auto Const1 = B.buildConstant(S32, ExpBits);
1939 
1940   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1941     .addUse(Hi)
1942     .addUse(Const0.getReg(0))
1943     .addUse(Const1.getReg(0));
1944 
1945   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1946 }
1947 
1948 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1949   MachineInstr &MI, MachineRegisterInfo &MRI,
1950   MachineIRBuilder &B) const {
1951   const LLT S1 = LLT::scalar(1);
1952   const LLT S32 = LLT::scalar(32);
1953   const LLT S64 = LLT::scalar(64);
1954 
1955   Register Src = MI.getOperand(1).getReg();
1956   assert(MRI.getType(Src) == S64);
1957 
1958   // TODO: Should this use extract since the low half is unused?
1959   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1960   Register Hi = Unmerge.getReg(1);
1961 
1962   // Extract the upper half, since this is where we will find the sign and
1963   // exponent.
1964   auto Exp = extractF64Exponent(Hi, B);
1965 
1966   const unsigned FractBits = 52;
1967 
1968   // Extract the sign bit.
1969   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1970   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1971 
1972   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1973 
1974   const auto Zero32 = B.buildConstant(S32, 0);
1975 
1976   // Extend back to 64-bits.
1977   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1978 
1979   auto Shr = B.buildAShr(S64, FractMask, Exp);
1980   auto Not = B.buildNot(S64, Shr);
1981   auto Tmp0 = B.buildAnd(S64, Src, Not);
1982   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1983 
1984   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1985   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1986 
1987   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1988   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1989   MI.eraseFromParent();
1990   return true;
1991 }
1992 
1993 bool AMDGPULegalizerInfo::legalizeITOFP(
1994   MachineInstr &MI, MachineRegisterInfo &MRI,
1995   MachineIRBuilder &B, bool Signed) const {
1996 
1997   Register Dst = MI.getOperand(0).getReg();
1998   Register Src = MI.getOperand(1).getReg();
1999 
2000   const LLT S64 = LLT::scalar(64);
2001   const LLT S32 = LLT::scalar(32);
2002 
2003   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2004 
2005   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2006 
2007   auto CvtHi = Signed ?
2008     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2009     B.buildUITOFP(S64, Unmerge.getReg(1));
2010 
2011   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2012 
2013   auto ThirtyTwo = B.buildConstant(S32, 32);
2014   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2015     .addUse(CvtHi.getReg(0))
2016     .addUse(ThirtyTwo.getReg(0));
2017 
2018   // TODO: Should this propagate fast-math-flags?
2019   B.buildFAdd(Dst, LdExp, CvtLo);
2020   MI.eraseFromParent();
2021   return true;
2022 }
2023 
2024 // TODO: Copied from DAG implementation. Verify logic and document how this
2025 // actually works.
2026 bool AMDGPULegalizerInfo::legalizeFPTOI(
2027   MachineInstr &MI, MachineRegisterInfo &MRI,
2028   MachineIRBuilder &B, bool Signed) const {
2029 
2030   Register Dst = MI.getOperand(0).getReg();
2031   Register Src = MI.getOperand(1).getReg();
2032 
2033   const LLT S64 = LLT::scalar(64);
2034   const LLT S32 = LLT::scalar(32);
2035 
2036   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2037 
2038   unsigned Flags = MI.getFlags();
2039 
2040   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2041   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2042   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2043 
2044   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2045   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2046   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2047 
2048   auto Hi = Signed ?
2049     B.buildFPTOSI(S32, FloorMul) :
2050     B.buildFPTOUI(S32, FloorMul);
2051   auto Lo = B.buildFPTOUI(S32, Fma);
2052 
2053   B.buildMerge(Dst, { Lo, Hi });
2054   MI.eraseFromParent();
2055 
2056   return true;
2057 }
2058 
2059 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2060                                                MachineInstr &MI) const {
2061   MachineFunction &MF = Helper.MIRBuilder.getMF();
2062   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2063 
2064   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2065                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2066 
2067   // With ieee_mode disabled, the instructions have the correct behavior
2068   // already for G_FMINNUM/G_FMAXNUM
2069   if (!MFI->getMode().IEEE)
2070     return !IsIEEEOp;
2071 
2072   if (IsIEEEOp)
2073     return true;
2074 
2075   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2076 }
2077 
2078 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2079   MachineInstr &MI, MachineRegisterInfo &MRI,
2080   MachineIRBuilder &B) const {
2081   // TODO: Should move some of this into LegalizerHelper.
2082 
2083   // TODO: Promote dynamic indexing of s16 to s32
2084 
2085   // FIXME: Artifact combiner probably should have replaced the truncated
2086   // constant before this, so we shouldn't need
2087   // getConstantVRegValWithLookThrough.
2088   Optional<ValueAndVReg> MaybeIdxVal =
2089       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2090   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2091     return true;
2092   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2093 
2094   Register Dst = MI.getOperand(0).getReg();
2095   Register Vec = MI.getOperand(1).getReg();
2096 
2097   LLT VecTy = MRI.getType(Vec);
2098   LLT EltTy = VecTy.getElementType();
2099   assert(EltTy == MRI.getType(Dst));
2100 
2101   if (IdxVal < VecTy.getNumElements())
2102     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2103   else
2104     B.buildUndef(Dst);
2105 
2106   MI.eraseFromParent();
2107   return true;
2108 }
2109 
2110 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2111   MachineInstr &MI, MachineRegisterInfo &MRI,
2112   MachineIRBuilder &B) const {
2113   // TODO: Should move some of this into LegalizerHelper.
2114 
2115   // TODO: Promote dynamic indexing of s16 to s32
2116 
2117   // FIXME: Artifact combiner probably should have replaced the truncated
2118   // constant before this, so we shouldn't need
2119   // getConstantVRegValWithLookThrough.
2120   Optional<ValueAndVReg> MaybeIdxVal =
2121       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2122   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2123     return true;
2124 
2125   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2126   Register Dst = MI.getOperand(0).getReg();
2127   Register Vec = MI.getOperand(1).getReg();
2128   Register Ins = MI.getOperand(2).getReg();
2129 
2130   LLT VecTy = MRI.getType(Vec);
2131   LLT EltTy = VecTy.getElementType();
2132   assert(EltTy == MRI.getType(Ins));
2133 
2134   if (IdxVal < VecTy.getNumElements())
2135     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2136   else
2137     B.buildUndef(Dst);
2138 
2139   MI.eraseFromParent();
2140   return true;
2141 }
2142 
2143 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2144   MachineInstr &MI, MachineRegisterInfo &MRI,
2145   MachineIRBuilder &B) const {
2146   const LLT V2S16 = LLT::vector(2, 16);
2147 
2148   Register Dst = MI.getOperand(0).getReg();
2149   Register Src0 = MI.getOperand(1).getReg();
2150   LLT DstTy = MRI.getType(Dst);
2151   LLT SrcTy = MRI.getType(Src0);
2152 
2153   if (SrcTy == V2S16 && DstTy == V2S16 &&
2154       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2155     return true;
2156 
2157   MachineIRBuilder HelperBuilder(MI);
2158   GISelObserverWrapper DummyObserver;
2159   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2160   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2161 }
2162 
2163 bool AMDGPULegalizerInfo::legalizeSinCos(
2164   MachineInstr &MI, MachineRegisterInfo &MRI,
2165   MachineIRBuilder &B) const {
2166 
2167   Register DstReg = MI.getOperand(0).getReg();
2168   Register SrcReg = MI.getOperand(1).getReg();
2169   LLT Ty = MRI.getType(DstReg);
2170   unsigned Flags = MI.getFlags();
2171 
2172   Register TrigVal;
2173   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2174   if (ST.hasTrigReducedRange()) {
2175     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2176     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2177       .addUse(MulVal.getReg(0))
2178       .setMIFlags(Flags).getReg(0);
2179   } else
2180     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2181 
2182   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2183     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2184   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2185     .addUse(TrigVal)
2186     .setMIFlags(Flags);
2187   MI.eraseFromParent();
2188   return true;
2189 }
2190 
2191 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2192                                                   MachineIRBuilder &B,
2193                                                   const GlobalValue *GV,
2194                                                   int64_t Offset,
2195                                                   unsigned GAFlags) const {
2196   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2197   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2198   // to the following code sequence:
2199   //
2200   // For constant address space:
2201   //   s_getpc_b64 s[0:1]
2202   //   s_add_u32 s0, s0, $symbol
2203   //   s_addc_u32 s1, s1, 0
2204   //
2205   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2206   //   a fixup or relocation is emitted to replace $symbol with a literal
2207   //   constant, which is a pc-relative offset from the encoding of the $symbol
2208   //   operand to the global variable.
2209   //
2210   // For global address space:
2211   //   s_getpc_b64 s[0:1]
2212   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2213   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2214   //
2215   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2216   //   fixups or relocations are emitted to replace $symbol@*@lo and
2217   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2218   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2219   //   operand to the global variable.
2220   //
2221   // What we want here is an offset from the value returned by s_getpc
2222   // (which is the address of the s_add_u32 instruction) to the global
2223   // variable, but since the encoding of $symbol starts 4 bytes after the start
2224   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2225   // small. This requires us to add 4 to the global variable offset in order to
2226   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2227   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2228   // instruction.
2229 
2230   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2231 
2232   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2233     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2234 
2235   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2236     .addDef(PCReg);
2237 
2238   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2239   if (GAFlags == SIInstrInfo::MO_NONE)
2240     MIB.addImm(0);
2241   else
2242     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2243 
2244   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2245 
2246   if (PtrTy.getSizeInBits() == 32)
2247     B.buildExtract(DstReg, PCReg, 0);
2248   return true;
2249  }
2250 
2251 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2252   MachineInstr &MI, MachineRegisterInfo &MRI,
2253   MachineIRBuilder &B) const {
2254   Register DstReg = MI.getOperand(0).getReg();
2255   LLT Ty = MRI.getType(DstReg);
2256   unsigned AS = Ty.getAddressSpace();
2257 
2258   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2259   MachineFunction &MF = B.getMF();
2260   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2261 
2262   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2263     if (!MFI->isModuleEntryFunction()) {
2264       const Function &Fn = MF.getFunction();
2265       DiagnosticInfoUnsupported BadLDSDecl(
2266         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2267         DS_Warning);
2268       Fn.getContext().diagnose(BadLDSDecl);
2269 
2270       // We currently don't have a way to correctly allocate LDS objects that
2271       // aren't directly associated with a kernel. We do force inlining of
2272       // functions that use local objects. However, if these dead functions are
2273       // not eliminated, we don't want a compile time error. Just emit a warning
2274       // and a trap, since there should be no callable path here.
2275       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2276       B.buildUndef(DstReg);
2277       MI.eraseFromParent();
2278       return true;
2279     }
2280 
2281     // TODO: We could emit code to handle the initialization somewhere.
2282     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2283       const SITargetLowering *TLI = ST.getTargetLowering();
2284       if (!TLI->shouldUseLDSConstAddress(GV)) {
2285         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2286         return true; // Leave in place;
2287       }
2288 
2289       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2290         Type *Ty = GV->getValueType();
2291         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2292         // zero-sized type in other languages to declare the dynamic shared
2293         // memory which size is not known at the compile time. They will be
2294         // allocated by the runtime and placed directly after the static
2295         // allocated ones. They all share the same offset.
2296         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2297           // Adjust alignment for that dynamic shared memory array.
2298           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2299           LLT S32 = LLT::scalar(32);
2300           auto Sz =
2301               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2302           B.buildIntToPtr(DstReg, Sz);
2303           MI.eraseFromParent();
2304           return true;
2305         }
2306       }
2307 
2308       B.buildConstant(
2309           DstReg,
2310           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2311       MI.eraseFromParent();
2312       return true;
2313     }
2314 
2315     const Function &Fn = MF.getFunction();
2316     DiagnosticInfoUnsupported BadInit(
2317       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2318     Fn.getContext().diagnose(BadInit);
2319     return true;
2320   }
2321 
2322   const SITargetLowering *TLI = ST.getTargetLowering();
2323 
2324   if (TLI->shouldEmitFixup(GV)) {
2325     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2326     MI.eraseFromParent();
2327     return true;
2328   }
2329 
2330   if (TLI->shouldEmitPCReloc(GV)) {
2331     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2332     MI.eraseFromParent();
2333     return true;
2334   }
2335 
2336   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2337   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2338 
2339   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2340       MachinePointerInfo::getGOT(MF),
2341       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2342           MachineMemOperand::MOInvariant,
2343       8 /*Size*/, Align(8));
2344 
2345   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2346 
2347   if (Ty.getSizeInBits() == 32) {
2348     // Truncate if this is a 32-bit constant adrdess.
2349     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2350     B.buildExtract(DstReg, Load, 0);
2351   } else
2352     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2353 
2354   MI.eraseFromParent();
2355   return true;
2356 }
2357 
2358 static LLT widenToNextPowerOf2(LLT Ty) {
2359   if (Ty.isVector())
2360     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2361   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2362 }
2363 
2364 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2365                                        MachineInstr &MI) const {
2366   MachineIRBuilder &B = Helper.MIRBuilder;
2367   MachineRegisterInfo &MRI = *B.getMRI();
2368   GISelChangeObserver &Observer = Helper.Observer;
2369 
2370   Register PtrReg = MI.getOperand(1).getReg();
2371   LLT PtrTy = MRI.getType(PtrReg);
2372   unsigned AddrSpace = PtrTy.getAddressSpace();
2373 
2374   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2375     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2376     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2377     Observer.changingInstr(MI);
2378     MI.getOperand(1).setReg(Cast.getReg(0));
2379     Observer.changedInstr(MI);
2380     return true;
2381   }
2382 
2383   Register ValReg = MI.getOperand(0).getReg();
2384   LLT ValTy = MRI.getType(ValReg);
2385 
2386   MachineMemOperand *MMO = *MI.memoperands_begin();
2387   const unsigned ValSize = ValTy.getSizeInBits();
2388   const unsigned MemSize = 8 * MMO->getSize();
2389   const Align MemAlign = MMO->getAlign();
2390   const unsigned AlignInBits = 8 * MemAlign.value();
2391 
2392   // Widen non-power-of-2 loads to the alignment if needed
2393   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2394     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2395 
2396     // This was already the correct extending load result type, so just adjust
2397     // the memory type.
2398     if (WideMemSize == ValSize) {
2399       MachineFunction &MF = B.getMF();
2400 
2401       MachineMemOperand *WideMMO =
2402           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2403       Observer.changingInstr(MI);
2404       MI.setMemRefs(MF, {WideMMO});
2405       Observer.changedInstr(MI);
2406       return true;
2407     }
2408 
2409     // Don't bother handling edge case that should probably never be produced.
2410     if (ValSize > WideMemSize)
2411       return false;
2412 
2413     LLT WideTy = widenToNextPowerOf2(ValTy);
2414 
2415     Register WideLoad;
2416     if (!WideTy.isVector()) {
2417       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2418       B.buildTrunc(ValReg, WideLoad).getReg(0);
2419     } else {
2420       // Extract the subvector.
2421 
2422       if (isRegisterType(ValTy)) {
2423         // If this a case where G_EXTRACT is legal, use it.
2424         // (e.g. <3 x s32> -> <4 x s32>)
2425         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2426         B.buildExtract(ValReg, WideLoad, 0);
2427       } else {
2428         // For cases where the widened type isn't a nice register value, unmerge
2429         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2430         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2431         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2432         B.setInsertPt(B.getMBB(), MI.getIterator());
2433         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2434       }
2435     }
2436 
2437     MI.eraseFromParent();
2438     return true;
2439   }
2440 
2441   return false;
2442 }
2443 
2444 bool AMDGPULegalizerInfo::legalizeFMad(
2445   MachineInstr &MI, MachineRegisterInfo &MRI,
2446   MachineIRBuilder &B) const {
2447   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2448   assert(Ty.isScalar());
2449 
2450   MachineFunction &MF = B.getMF();
2451   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2452 
2453   // TODO: Always legal with future ftz flag.
2454   // FIXME: Do we need just output?
2455   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2456     return true;
2457   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2458     return true;
2459 
2460   MachineIRBuilder HelperBuilder(MI);
2461   GISelObserverWrapper DummyObserver;
2462   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2463   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2464 }
2465 
2466 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2467   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2468   Register DstReg = MI.getOperand(0).getReg();
2469   Register PtrReg = MI.getOperand(1).getReg();
2470   Register CmpVal = MI.getOperand(2).getReg();
2471   Register NewVal = MI.getOperand(3).getReg();
2472 
2473   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2474          "this should not have been custom lowered");
2475 
2476   LLT ValTy = MRI.getType(CmpVal);
2477   LLT VecTy = LLT::vector(2, ValTy);
2478 
2479   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2480 
2481   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2482     .addDef(DstReg)
2483     .addUse(PtrReg)
2484     .addUse(PackedVal)
2485     .setMemRefs(MI.memoperands());
2486 
2487   MI.eraseFromParent();
2488   return true;
2489 }
2490 
2491 bool AMDGPULegalizerInfo::legalizeFlog(
2492   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2493   Register Dst = MI.getOperand(0).getReg();
2494   Register Src = MI.getOperand(1).getReg();
2495   LLT Ty = B.getMRI()->getType(Dst);
2496   unsigned Flags = MI.getFlags();
2497 
2498   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2499   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2500 
2501   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2502   MI.eraseFromParent();
2503   return true;
2504 }
2505 
2506 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2507                                        MachineIRBuilder &B) const {
2508   Register Dst = MI.getOperand(0).getReg();
2509   Register Src = MI.getOperand(1).getReg();
2510   unsigned Flags = MI.getFlags();
2511   LLT Ty = B.getMRI()->getType(Dst);
2512 
2513   auto K = B.buildFConstant(Ty, numbers::log2e);
2514   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2515   B.buildFExp2(Dst, Mul, Flags);
2516   MI.eraseFromParent();
2517   return true;
2518 }
2519 
2520 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2521                                        MachineIRBuilder &B) const {
2522   Register Dst = MI.getOperand(0).getReg();
2523   Register Src0 = MI.getOperand(1).getReg();
2524   Register Src1 = MI.getOperand(2).getReg();
2525   unsigned Flags = MI.getFlags();
2526   LLT Ty = B.getMRI()->getType(Dst);
2527   const LLT S16 = LLT::scalar(16);
2528   const LLT S32 = LLT::scalar(32);
2529 
2530   if (Ty == S32) {
2531     auto Log = B.buildFLog2(S32, Src0, Flags);
2532     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2533       .addUse(Log.getReg(0))
2534       .addUse(Src1)
2535       .setMIFlags(Flags);
2536     B.buildFExp2(Dst, Mul, Flags);
2537   } else if (Ty == S16) {
2538     // There's no f16 fmul_legacy, so we need to convert for it.
2539     auto Log = B.buildFLog2(S16, Src0, Flags);
2540     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2541     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2542     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2543       .addUse(Ext0.getReg(0))
2544       .addUse(Ext1.getReg(0))
2545       .setMIFlags(Flags);
2546 
2547     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2548   } else
2549     return false;
2550 
2551   MI.eraseFromParent();
2552   return true;
2553 }
2554 
2555 // Find a source register, ignoring any possible source modifiers.
2556 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2557   Register ModSrc = OrigSrc;
2558   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2559     ModSrc = SrcFNeg->getOperand(1).getReg();
2560     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2561       ModSrc = SrcFAbs->getOperand(1).getReg();
2562   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2563     ModSrc = SrcFAbs->getOperand(1).getReg();
2564   return ModSrc;
2565 }
2566 
2567 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2568                                          MachineRegisterInfo &MRI,
2569                                          MachineIRBuilder &B) const {
2570 
2571   const LLT S1 = LLT::scalar(1);
2572   const LLT S64 = LLT::scalar(64);
2573   Register Dst = MI.getOperand(0).getReg();
2574   Register OrigSrc = MI.getOperand(1).getReg();
2575   unsigned Flags = MI.getFlags();
2576   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2577          "this should not have been custom lowered");
2578 
2579   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2580   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2581   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2582   // V_FRACT bug is:
2583   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2584   //
2585   // Convert floor(x) to (x - fract(x))
2586 
2587   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2588     .addUse(OrigSrc)
2589     .setMIFlags(Flags);
2590 
2591   // Give source modifier matching some assistance before obscuring a foldable
2592   // pattern.
2593 
2594   // TODO: We can avoid the neg on the fract? The input sign to fract
2595   // shouldn't matter?
2596   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2597 
2598   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2599 
2600   Register Min = MRI.createGenericVirtualRegister(S64);
2601 
2602   // We don't need to concern ourselves with the snan handling difference, so
2603   // use the one which will directly select.
2604   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2605   if (MFI->getMode().IEEE)
2606     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2607   else
2608     B.buildFMinNum(Min, Fract, Const, Flags);
2609 
2610   Register CorrectedFract = Min;
2611   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2612     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2613     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2614   }
2615 
2616   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2617   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2618 
2619   MI.eraseFromParent();
2620   return true;
2621 }
2622 
2623 // Turn an illegal packed v2s16 build vector into bit operations.
2624 // TODO: This should probably be a bitcast action in LegalizerHelper.
2625 bool AMDGPULegalizerInfo::legalizeBuildVector(
2626   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2627   Register Dst = MI.getOperand(0).getReg();
2628   const LLT S32 = LLT::scalar(32);
2629   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2630 
2631   Register Src0 = MI.getOperand(1).getReg();
2632   Register Src1 = MI.getOperand(2).getReg();
2633   assert(MRI.getType(Src0) == LLT::scalar(16));
2634 
2635   auto Merge = B.buildMerge(S32, {Src0, Src1});
2636   B.buildBitcast(Dst, Merge);
2637 
2638   MI.eraseFromParent();
2639   return true;
2640 }
2641 
2642 // Check that this is a G_XOR x, -1
2643 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2644   if (MI.getOpcode() != TargetOpcode::G_XOR)
2645     return false;
2646   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2647   return ConstVal && *ConstVal == -1;
2648 }
2649 
2650 // Return the use branch instruction, otherwise null if the usage is invalid.
2651 static MachineInstr *
2652 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2653                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2654   Register CondDef = MI.getOperand(0).getReg();
2655   if (!MRI.hasOneNonDBGUse(CondDef))
2656     return nullptr;
2657 
2658   MachineBasicBlock *Parent = MI.getParent();
2659   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2660 
2661   if (isNot(MRI, *UseMI)) {
2662     Register NegatedCond = UseMI->getOperand(0).getReg();
2663     if (!MRI.hasOneNonDBGUse(NegatedCond))
2664       return nullptr;
2665 
2666     // We're deleting the def of this value, so we need to remove it.
2667     UseMI->eraseFromParent();
2668 
2669     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2670     Negated = true;
2671   }
2672 
2673   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2674     return nullptr;
2675 
2676   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2677   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2678   if (Next == Parent->end()) {
2679     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2680     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2681       return nullptr;
2682     UncondBrTarget = &*NextMBB;
2683   } else {
2684     if (Next->getOpcode() != AMDGPU::G_BR)
2685       return nullptr;
2686     Br = &*Next;
2687     UncondBrTarget = Br->getOperand(0).getMBB();
2688   }
2689 
2690   return UseMI;
2691 }
2692 
2693 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2694                                          const ArgDescriptor *Arg,
2695                                          const TargetRegisterClass *ArgRC,
2696                                          LLT ArgTy) const {
2697   MCRegister SrcReg = Arg->getRegister();
2698   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2699   assert(DstReg.isVirtual() && "Virtual register expected");
2700 
2701   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2702                                              ArgTy);
2703   if (Arg->isMasked()) {
2704     // TODO: Should we try to emit this once in the entry block?
2705     const LLT S32 = LLT::scalar(32);
2706     const unsigned Mask = Arg->getMask();
2707     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2708 
2709     Register AndMaskSrc = LiveIn;
2710 
2711     if (Shift != 0) {
2712       auto ShiftAmt = B.buildConstant(S32, Shift);
2713       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2714     }
2715 
2716     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2717   } else {
2718     B.buildCopy(DstReg, LiveIn);
2719   }
2720 
2721   return true;
2722 }
2723 
2724 bool AMDGPULegalizerInfo::loadInputValue(
2725     Register DstReg, MachineIRBuilder &B,
2726     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2727   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2728   const ArgDescriptor *Arg;
2729   const TargetRegisterClass *ArgRC;
2730   LLT ArgTy;
2731   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2732 
2733   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2734     return false; // TODO: Handle these
2735   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2736 }
2737 
2738 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2739     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2740     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2741   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2742     return false;
2743 
2744   MI.eraseFromParent();
2745   return true;
2746 }
2747 
2748 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2749                                        MachineRegisterInfo &MRI,
2750                                        MachineIRBuilder &B) const {
2751   Register Dst = MI.getOperand(0).getReg();
2752   LLT DstTy = MRI.getType(Dst);
2753   LLT S16 = LLT::scalar(16);
2754   LLT S32 = LLT::scalar(32);
2755   LLT S64 = LLT::scalar(64);
2756 
2757   if (DstTy == S16)
2758     return legalizeFDIV16(MI, MRI, B);
2759   if (DstTy == S32)
2760     return legalizeFDIV32(MI, MRI, B);
2761   if (DstTy == S64)
2762     return legalizeFDIV64(MI, MRI, B);
2763 
2764   return false;
2765 }
2766 
2767 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2768                                                   Register DstReg,
2769                                                   Register X,
2770                                                   Register Y,
2771                                                   bool IsDiv) const {
2772   const LLT S1 = LLT::scalar(1);
2773   const LLT S32 = LLT::scalar(32);
2774 
2775   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2776   // algorithm used here.
2777 
2778   // Initial estimate of inv(y).
2779   auto FloatY = B.buildUITOFP(S32, Y);
2780   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2781   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2782   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2783   auto Z = B.buildFPTOUI(S32, ScaledY);
2784 
2785   // One round of UNR.
2786   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2787   auto NegYZ = B.buildMul(S32, NegY, Z);
2788   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2789 
2790   // Quotient/remainder estimate.
2791   auto Q = B.buildUMulH(S32, X, Z);
2792   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2793 
2794   // First quotient/remainder refinement.
2795   auto One = B.buildConstant(S32, 1);
2796   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2797   if (IsDiv)
2798     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2799   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2800 
2801   // Second quotient/remainder refinement.
2802   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2803   if (IsDiv)
2804     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2805   else
2806     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2807 }
2808 
2809 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2810                                               MachineRegisterInfo &MRI,
2811                                               MachineIRBuilder &B) const {
2812   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2813   Register DstReg = MI.getOperand(0).getReg();
2814   Register Num = MI.getOperand(1).getReg();
2815   Register Den = MI.getOperand(2).getReg();
2816   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2817   MI.eraseFromParent();
2818   return true;
2819 }
2820 
2821 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2822 //
2823 // Return lo, hi of result
2824 //
2825 // %cvt.lo = G_UITOFP Val.lo
2826 // %cvt.hi = G_UITOFP Val.hi
2827 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2828 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2829 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2830 // %mul2 = G_FMUL %mul1, 2**(-32)
2831 // %trunc = G_INTRINSIC_TRUNC %mul2
2832 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2833 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2834 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2835                                                        Register Val) {
2836   const LLT S32 = LLT::scalar(32);
2837   auto Unmerge = B.buildUnmerge(S32, Val);
2838 
2839   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2840   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2841 
2842   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2843                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2844 
2845   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2846   auto Mul1 =
2847       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2848 
2849   // 2**(-32)
2850   auto Mul2 =
2851       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2852   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2853 
2854   // -(2**32)
2855   auto Mad2 = B.buildFMAD(S32, Trunc,
2856                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2857 
2858   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2859   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2860 
2861   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2862 }
2863 
2864 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2865                                                   Register DstReg,
2866                                                   Register Numer,
2867                                                   Register Denom,
2868                                                   bool IsDiv) const {
2869   const LLT S32 = LLT::scalar(32);
2870   const LLT S64 = LLT::scalar(64);
2871   const LLT S1 = LLT::scalar(1);
2872   Register RcpLo, RcpHi;
2873 
2874   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2875 
2876   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2877 
2878   auto Zero64 = B.buildConstant(S64, 0);
2879   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2880 
2881   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2882   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2883 
2884   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2885   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2886   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2887 
2888   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2889   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2890   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2891   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2892 
2893   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2894   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2895   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2896   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2897   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2898 
2899   auto Zero32 = B.buildConstant(S32, 0);
2900   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2901   auto Add2_HiC =
2902       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2903   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2904   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2905 
2906   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2907   Register NumerLo = UnmergeNumer.getReg(0);
2908   Register NumerHi = UnmergeNumer.getReg(1);
2909 
2910   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2911   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2912   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2913   Register Mul3_Lo = UnmergeMul3.getReg(0);
2914   Register Mul3_Hi = UnmergeMul3.getReg(1);
2915   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2916   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2917   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2918   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2919 
2920   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2921   Register DenomLo = UnmergeDenom.getReg(0);
2922   Register DenomHi = UnmergeDenom.getReg(1);
2923 
2924   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2925   auto C1 = B.buildSExt(S32, CmpHi);
2926 
2927   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2928   auto C2 = B.buildSExt(S32, CmpLo);
2929 
2930   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2931   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2932 
2933   // TODO: Here and below portions of the code can be enclosed into if/endif.
2934   // Currently control flow is unconditional and we have 4 selects after
2935   // potential endif to substitute PHIs.
2936 
2937   // if C3 != 0 ...
2938   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2939   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2940   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2941   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2942 
2943   auto One64 = B.buildConstant(S64, 1);
2944   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2945 
2946   auto C4 =
2947       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2948   auto C5 =
2949       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2950   auto C6 = B.buildSelect(
2951       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2952 
2953   // if (C6 != 0)
2954   auto Add4 = B.buildAdd(S64, Add3, One64);
2955   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2956 
2957   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2958   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2959   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2960 
2961   // endif C6
2962   // endif C3
2963 
2964   if (IsDiv) {
2965     auto Sel1 = B.buildSelect(
2966         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2967     B.buildSelect(DstReg,
2968                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2969   } else {
2970     auto Sel2 = B.buildSelect(
2971         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2972     B.buildSelect(DstReg,
2973                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2974   }
2975 }
2976 
2977 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2978                                             MachineRegisterInfo &MRI,
2979                                             MachineIRBuilder &B) const {
2980   const LLT S64 = LLT::scalar(64);
2981   const LLT S32 = LLT::scalar(32);
2982   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2983   Register DstReg = MI.getOperand(0).getReg();
2984   Register Num = MI.getOperand(1).getReg();
2985   Register Den = MI.getOperand(2).getReg();
2986   LLT Ty = MRI.getType(DstReg);
2987 
2988   if (Ty == S32)
2989     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2990   else if (Ty == S64)
2991     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2992   else
2993     return false;
2994 
2995   MI.eraseFromParent();
2996   return true;
2997 
2998 }
2999 
3000 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3001                                             MachineRegisterInfo &MRI,
3002                                             MachineIRBuilder &B) const {
3003   const LLT S64 = LLT::scalar(64);
3004   const LLT S32 = LLT::scalar(32);
3005 
3006   Register DstReg = MI.getOperand(0).getReg();
3007   const LLT Ty = MRI.getType(DstReg);
3008   if (Ty != S32 && Ty != S64)
3009     return false;
3010 
3011   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3012 
3013   Register LHS = MI.getOperand(1).getReg();
3014   Register RHS = MI.getOperand(2).getReg();
3015 
3016   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3017   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3018   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3019 
3020   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3021   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3022 
3023   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3024   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3025 
3026   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3027   if (Ty == S32)
3028     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3029   else
3030     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3031 
3032   Register Sign;
3033   if (IsDiv)
3034     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3035   else
3036     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3037 
3038   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3039   B.buildSub(DstReg, UDivRem, Sign);
3040 
3041   MI.eraseFromParent();
3042   return true;
3043 }
3044 
3045 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3046                                                  MachineRegisterInfo &MRI,
3047                                                  MachineIRBuilder &B) const {
3048   Register Res = MI.getOperand(0).getReg();
3049   Register LHS = MI.getOperand(1).getReg();
3050   Register RHS = MI.getOperand(2).getReg();
3051   uint16_t Flags = MI.getFlags();
3052   LLT ResTy = MRI.getType(Res);
3053 
3054   const MachineFunction &MF = B.getMF();
3055   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3056                             MI.getFlag(MachineInstr::FmAfn);
3057 
3058   if (!AllowInaccurateRcp)
3059     return false;
3060 
3061   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3062     // 1 / x -> RCP(x)
3063     if (CLHS->isExactlyValue(1.0)) {
3064       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3065         .addUse(RHS)
3066         .setMIFlags(Flags);
3067 
3068       MI.eraseFromParent();
3069       return true;
3070     }
3071 
3072     // -1 / x -> RCP( FNEG(x) )
3073     if (CLHS->isExactlyValue(-1.0)) {
3074       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3075       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3076         .addUse(FNeg.getReg(0))
3077         .setMIFlags(Flags);
3078 
3079       MI.eraseFromParent();
3080       return true;
3081     }
3082   }
3083 
3084   // x / y -> x * (1.0 / y)
3085   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3086     .addUse(RHS)
3087     .setMIFlags(Flags);
3088   B.buildFMul(Res, LHS, RCP, Flags);
3089 
3090   MI.eraseFromParent();
3091   return true;
3092 }
3093 
3094 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3095                                                    MachineRegisterInfo &MRI,
3096                                                    MachineIRBuilder &B) const {
3097   Register Res = MI.getOperand(0).getReg();
3098   Register X = MI.getOperand(1).getReg();
3099   Register Y = MI.getOperand(2).getReg();
3100   uint16_t Flags = MI.getFlags();
3101   LLT ResTy = MRI.getType(Res);
3102 
3103   const MachineFunction &MF = B.getMF();
3104   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3105                             MI.getFlag(MachineInstr::FmAfn);
3106 
3107   if (!AllowInaccurateRcp)
3108     return false;
3109 
3110   auto NegY = B.buildFNeg(ResTy, Y);
3111   auto One = B.buildFConstant(ResTy, 1.0);
3112 
3113   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3114     .addUse(Y)
3115     .setMIFlags(Flags);
3116 
3117   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3118   R = B.buildFMA(ResTy, Tmp0, R, R);
3119 
3120   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3121   R = B.buildFMA(ResTy, Tmp1, R, R);
3122 
3123   auto Ret = B.buildFMul(ResTy, X, R);
3124   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3125 
3126   B.buildFMA(Res, Tmp2, R, Ret);
3127   MI.eraseFromParent();
3128   return true;
3129 }
3130 
3131 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3132                                          MachineRegisterInfo &MRI,
3133                                          MachineIRBuilder &B) const {
3134   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3135     return true;
3136 
3137   Register Res = MI.getOperand(0).getReg();
3138   Register LHS = MI.getOperand(1).getReg();
3139   Register RHS = MI.getOperand(2).getReg();
3140 
3141   uint16_t Flags = MI.getFlags();
3142 
3143   LLT S16 = LLT::scalar(16);
3144   LLT S32 = LLT::scalar(32);
3145 
3146   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3147   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3148 
3149   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3150     .addUse(RHSExt.getReg(0))
3151     .setMIFlags(Flags);
3152 
3153   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3154   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3155 
3156   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3157     .addUse(RDst.getReg(0))
3158     .addUse(RHS)
3159     .addUse(LHS)
3160     .setMIFlags(Flags);
3161 
3162   MI.eraseFromParent();
3163   return true;
3164 }
3165 
3166 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3167 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3168 static void toggleSPDenormMode(bool Enable,
3169                                MachineIRBuilder &B,
3170                                const GCNSubtarget &ST,
3171                                AMDGPU::SIModeRegisterDefaults Mode) {
3172   // Set SP denorm mode to this value.
3173   unsigned SPDenormMode =
3174     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3175 
3176   if (ST.hasDenormModeInst()) {
3177     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3178     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3179 
3180     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3181     B.buildInstr(AMDGPU::S_DENORM_MODE)
3182       .addImm(NewDenormModeValue);
3183 
3184   } else {
3185     // Select FP32 bit field in mode register.
3186     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3187                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3188                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3189 
3190     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3191       .addImm(SPDenormMode)
3192       .addImm(SPDenormModeBitField);
3193   }
3194 }
3195 
3196 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3197                                          MachineRegisterInfo &MRI,
3198                                          MachineIRBuilder &B) const {
3199   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3200     return true;
3201 
3202   Register Res = MI.getOperand(0).getReg();
3203   Register LHS = MI.getOperand(1).getReg();
3204   Register RHS = MI.getOperand(2).getReg();
3205   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3206   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3207 
3208   uint16_t Flags = MI.getFlags();
3209 
3210   LLT S32 = LLT::scalar(32);
3211   LLT S1 = LLT::scalar(1);
3212 
3213   auto One = B.buildFConstant(S32, 1.0f);
3214 
3215   auto DenominatorScaled =
3216     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3217       .addUse(LHS)
3218       .addUse(RHS)
3219       .addImm(0)
3220       .setMIFlags(Flags);
3221   auto NumeratorScaled =
3222     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3223       .addUse(LHS)
3224       .addUse(RHS)
3225       .addImm(1)
3226       .setMIFlags(Flags);
3227 
3228   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3229     .addUse(DenominatorScaled.getReg(0))
3230     .setMIFlags(Flags);
3231   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3232 
3233   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3234   // aren't modeled as reading it.
3235   if (!Mode.allFP32Denormals())
3236     toggleSPDenormMode(true, B, ST, Mode);
3237 
3238   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3239   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3240   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3241   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3242   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3243   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3244 
3245   if (!Mode.allFP32Denormals())
3246     toggleSPDenormMode(false, B, ST, Mode);
3247 
3248   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3249     .addUse(Fma4.getReg(0))
3250     .addUse(Fma1.getReg(0))
3251     .addUse(Fma3.getReg(0))
3252     .addUse(NumeratorScaled.getReg(1))
3253     .setMIFlags(Flags);
3254 
3255   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3256     .addUse(Fmas.getReg(0))
3257     .addUse(RHS)
3258     .addUse(LHS)
3259     .setMIFlags(Flags);
3260 
3261   MI.eraseFromParent();
3262   return true;
3263 }
3264 
3265 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3266                                          MachineRegisterInfo &MRI,
3267                                          MachineIRBuilder &B) const {
3268   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3269     return true;
3270 
3271   Register Res = MI.getOperand(0).getReg();
3272   Register LHS = MI.getOperand(1).getReg();
3273   Register RHS = MI.getOperand(2).getReg();
3274 
3275   uint16_t Flags = MI.getFlags();
3276 
3277   LLT S64 = LLT::scalar(64);
3278   LLT S1 = LLT::scalar(1);
3279 
3280   auto One = B.buildFConstant(S64, 1.0);
3281 
3282   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3283     .addUse(LHS)
3284     .addUse(RHS)
3285     .addImm(0)
3286     .setMIFlags(Flags);
3287 
3288   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3289 
3290   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3291     .addUse(DivScale0.getReg(0))
3292     .setMIFlags(Flags);
3293 
3294   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3295   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3296   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3297 
3298   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3299     .addUse(LHS)
3300     .addUse(RHS)
3301     .addImm(1)
3302     .setMIFlags(Flags);
3303 
3304   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3305   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3306   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3307 
3308   Register Scale;
3309   if (!ST.hasUsableDivScaleConditionOutput()) {
3310     // Workaround a hardware bug on SI where the condition output from div_scale
3311     // is not usable.
3312 
3313     LLT S32 = LLT::scalar(32);
3314 
3315     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3316     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3317     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3318     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3319 
3320     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3321                               Scale1Unmerge.getReg(1));
3322     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3323                               Scale0Unmerge.getReg(1));
3324     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3325   } else {
3326     Scale = DivScale1.getReg(1);
3327   }
3328 
3329   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3330     .addUse(Fma4.getReg(0))
3331     .addUse(Fma3.getReg(0))
3332     .addUse(Mul.getReg(0))
3333     .addUse(Scale)
3334     .setMIFlags(Flags);
3335 
3336   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3337     .addUse(Fmas.getReg(0))
3338     .addUse(RHS)
3339     .addUse(LHS)
3340     .setMIFlags(Flags);
3341 
3342   MI.eraseFromParent();
3343   return true;
3344 }
3345 
3346 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3347                                                  MachineRegisterInfo &MRI,
3348                                                  MachineIRBuilder &B) const {
3349   Register Res = MI.getOperand(0).getReg();
3350   Register LHS = MI.getOperand(2).getReg();
3351   Register RHS = MI.getOperand(3).getReg();
3352   uint16_t Flags = MI.getFlags();
3353 
3354   LLT S32 = LLT::scalar(32);
3355   LLT S1 = LLT::scalar(1);
3356 
3357   auto Abs = B.buildFAbs(S32, RHS, Flags);
3358   const APFloat C0Val(1.0f);
3359 
3360   auto C0 = B.buildConstant(S32, 0x6f800000);
3361   auto C1 = B.buildConstant(S32, 0x2f800000);
3362   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3363 
3364   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3365   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3366 
3367   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3368 
3369   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3370     .addUse(Mul0.getReg(0))
3371     .setMIFlags(Flags);
3372 
3373   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3374 
3375   B.buildFMul(Res, Sel, Mul1, Flags);
3376 
3377   MI.eraseFromParent();
3378   return true;
3379 }
3380 
3381 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3382 // FIXME: Why do we handle this one but not other removed instructions?
3383 //
3384 // Reciprocal square root.  The clamp prevents infinite results, clamping
3385 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3386 // +-max_float.
3387 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3388                                                     MachineRegisterInfo &MRI,
3389                                                     MachineIRBuilder &B) const {
3390   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3391     return true;
3392 
3393   Register Dst = MI.getOperand(0).getReg();
3394   Register Src = MI.getOperand(2).getReg();
3395   auto Flags = MI.getFlags();
3396 
3397   LLT Ty = MRI.getType(Dst);
3398 
3399   const fltSemantics *FltSemantics;
3400   if (Ty == LLT::scalar(32))
3401     FltSemantics = &APFloat::IEEEsingle();
3402   else if (Ty == LLT::scalar(64))
3403     FltSemantics = &APFloat::IEEEdouble();
3404   else
3405     return false;
3406 
3407   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3408     .addUse(Src)
3409     .setMIFlags(Flags);
3410 
3411   // We don't need to concern ourselves with the snan handling difference, since
3412   // the rsq quieted (or not) so use the one which will directly select.
3413   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3414   const bool UseIEEE = MFI->getMode().IEEE;
3415 
3416   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3417   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3418                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3419 
3420   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3421 
3422   if (UseIEEE)
3423     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3424   else
3425     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3426   MI.eraseFromParent();
3427   return true;
3428 }
3429 
3430 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3431   switch (IID) {
3432   case Intrinsic::amdgcn_ds_fadd:
3433     return AMDGPU::G_ATOMICRMW_FADD;
3434   case Intrinsic::amdgcn_ds_fmin:
3435     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3436   case Intrinsic::amdgcn_ds_fmax:
3437     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3438   default:
3439     llvm_unreachable("not a DS FP intrinsic");
3440   }
3441 }
3442 
3443 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3444                                                       MachineInstr &MI,
3445                                                       Intrinsic::ID IID) const {
3446   GISelChangeObserver &Observer = Helper.Observer;
3447   Observer.changingInstr(MI);
3448 
3449   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3450 
3451   // The remaining operands were used to set fields in the MemOperand on
3452   // construction.
3453   for (int I = 6; I > 3; --I)
3454     MI.RemoveOperand(I);
3455 
3456   MI.RemoveOperand(1); // Remove the intrinsic ID.
3457   Observer.changedInstr(MI);
3458   return true;
3459 }
3460 
3461 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3462                                             MachineRegisterInfo &MRI,
3463                                             MachineIRBuilder &B) const {
3464   uint64_t Offset =
3465     ST.getTargetLowering()->getImplicitParameterOffset(
3466       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3467   LLT DstTy = MRI.getType(DstReg);
3468   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3469 
3470   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3471   if (!loadInputValue(KernargPtrReg, B,
3472                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3473     return false;
3474 
3475   // FIXME: This should be nuw
3476   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3477   return true;
3478 }
3479 
3480 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3481                                                  MachineRegisterInfo &MRI,
3482                                                  MachineIRBuilder &B) const {
3483   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3484   if (!MFI->isEntryFunction()) {
3485     return legalizePreloadedArgIntrin(MI, MRI, B,
3486                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3487   }
3488 
3489   Register DstReg = MI.getOperand(0).getReg();
3490   if (!getImplicitArgPtr(DstReg, MRI, B))
3491     return false;
3492 
3493   MI.eraseFromParent();
3494   return true;
3495 }
3496 
3497 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3498                                               MachineRegisterInfo &MRI,
3499                                               MachineIRBuilder &B,
3500                                               unsigned AddrSpace) const {
3501   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3502   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3503   Register Hi32 = Unmerge.getReg(1);
3504 
3505   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3506   MI.eraseFromParent();
3507   return true;
3508 }
3509 
3510 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3511 // offset (the offset that is included in bounds checking and swizzling, to be
3512 // split between the instruction's voffset and immoffset fields) and soffset
3513 // (the offset that is excluded from bounds checking and swizzling, to go in
3514 // the instruction's soffset field).  This function takes the first kind of
3515 // offset and figures out how to split it between voffset and immoffset.
3516 std::tuple<Register, unsigned, unsigned>
3517 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3518                                         Register OrigOffset) const {
3519   const unsigned MaxImm = 4095;
3520   Register BaseReg;
3521   unsigned TotalConstOffset;
3522   const LLT S32 = LLT::scalar(32);
3523   MachineRegisterInfo &MRI = *B.getMRI();
3524 
3525   std::tie(BaseReg, TotalConstOffset) =
3526       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3527 
3528   unsigned ImmOffset = TotalConstOffset;
3529 
3530   // If BaseReg is a pointer, convert it to int.
3531   if (MRI.getType(BaseReg).isPointer())
3532     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3533 
3534   // If the immediate value is too big for the immoffset field, put the value
3535   // and -4096 into the immoffset field so that the value that is copied/added
3536   // for the voffset field is a multiple of 4096, and it stands more chance
3537   // of being CSEd with the copy/add for another similar load/store.
3538   // However, do not do that rounding down to a multiple of 4096 if that is a
3539   // negative number, as it appears to be illegal to have a negative offset
3540   // in the vgpr, even if adding the immediate offset makes it positive.
3541   unsigned Overflow = ImmOffset & ~MaxImm;
3542   ImmOffset -= Overflow;
3543   if ((int32_t)Overflow < 0) {
3544     Overflow += ImmOffset;
3545     ImmOffset = 0;
3546   }
3547 
3548   if (Overflow != 0) {
3549     if (!BaseReg) {
3550       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3551     } else {
3552       auto OverflowVal = B.buildConstant(S32, Overflow);
3553       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3554     }
3555   }
3556 
3557   if (!BaseReg)
3558     BaseReg = B.buildConstant(S32, 0).getReg(0);
3559 
3560   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3561 }
3562 
3563 /// Handle register layout difference for f16 images for some subtargets.
3564 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3565                                              MachineRegisterInfo &MRI,
3566                                              Register Reg,
3567                                              bool ImageStore) const {
3568   const LLT S16 = LLT::scalar(16);
3569   const LLT S32 = LLT::scalar(32);
3570   LLT StoreVT = MRI.getType(Reg);
3571   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3572 
3573   if (ST.hasUnpackedD16VMem()) {
3574     auto Unmerge = B.buildUnmerge(S16, Reg);
3575 
3576     SmallVector<Register, 4> WideRegs;
3577     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3578       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3579 
3580     int NumElts = StoreVT.getNumElements();
3581 
3582     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3583   }
3584 
3585   if (ImageStore && ST.hasImageStoreD16Bug()) {
3586     if (StoreVT.getNumElements() == 2) {
3587       SmallVector<Register, 4> PackedRegs;
3588       Reg = B.buildBitcast(S32, Reg).getReg(0);
3589       PackedRegs.push_back(Reg);
3590       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3591       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3592     }
3593 
3594     if (StoreVT.getNumElements() == 3) {
3595       SmallVector<Register, 4> PackedRegs;
3596       auto Unmerge = B.buildUnmerge(S16, Reg);
3597       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3598         PackedRegs.push_back(Unmerge.getReg(I));
3599       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3600       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3601       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3602     }
3603 
3604     if (StoreVT.getNumElements() == 4) {
3605       SmallVector<Register, 4> PackedRegs;
3606       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3607       auto Unmerge = B.buildUnmerge(S32, Reg);
3608       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3609         PackedRegs.push_back(Unmerge.getReg(I));
3610       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3611       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3612     }
3613 
3614     llvm_unreachable("invalid data type");
3615   }
3616 
3617   return Reg;
3618 }
3619 
3620 Register AMDGPULegalizerInfo::fixStoreSourceType(
3621   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3622   MachineRegisterInfo *MRI = B.getMRI();
3623   LLT Ty = MRI->getType(VData);
3624 
3625   const LLT S16 = LLT::scalar(16);
3626 
3627   // Fixup illegal register types for i8 stores.
3628   if (Ty == LLT::scalar(8) || Ty == S16) {
3629     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3630     return AnyExt;
3631   }
3632 
3633   if (Ty.isVector()) {
3634     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3635       if (IsFormat)
3636         return handleD16VData(B, *MRI, VData);
3637     }
3638   }
3639 
3640   return VData;
3641 }
3642 
3643 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3644                                               MachineRegisterInfo &MRI,
3645                                               MachineIRBuilder &B,
3646                                               bool IsTyped,
3647                                               bool IsFormat) const {
3648   Register VData = MI.getOperand(1).getReg();
3649   LLT Ty = MRI.getType(VData);
3650   LLT EltTy = Ty.getScalarType();
3651   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3652   const LLT S32 = LLT::scalar(32);
3653 
3654   VData = fixStoreSourceType(B, VData, IsFormat);
3655   Register RSrc = MI.getOperand(2).getReg();
3656 
3657   MachineMemOperand *MMO = *MI.memoperands_begin();
3658   const int MemSize = MMO->getSize();
3659 
3660   unsigned ImmOffset;
3661   unsigned TotalOffset;
3662 
3663   // The typed intrinsics add an immediate after the registers.
3664   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3665 
3666   // The struct intrinsic variants add one additional operand over raw.
3667   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3668   Register VIndex;
3669   int OpOffset = 0;
3670   if (HasVIndex) {
3671     VIndex = MI.getOperand(3).getReg();
3672     OpOffset = 1;
3673   }
3674 
3675   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3676   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3677 
3678   unsigned Format = 0;
3679   if (IsTyped) {
3680     Format = MI.getOperand(5 + OpOffset).getImm();
3681     ++OpOffset;
3682   }
3683 
3684   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3685 
3686   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3687   if (TotalOffset != 0)
3688     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3689 
3690   unsigned Opc;
3691   if (IsTyped) {
3692     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3693                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3694   } else if (IsFormat) {
3695     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3696                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3697   } else {
3698     switch (MemSize) {
3699     case 1:
3700       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3701       break;
3702     case 2:
3703       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3704       break;
3705     default:
3706       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3707       break;
3708     }
3709   }
3710 
3711   if (!VIndex)
3712     VIndex = B.buildConstant(S32, 0).getReg(0);
3713 
3714   auto MIB = B.buildInstr(Opc)
3715     .addUse(VData)              // vdata
3716     .addUse(RSrc)               // rsrc
3717     .addUse(VIndex)             // vindex
3718     .addUse(VOffset)            // voffset
3719     .addUse(SOffset)            // soffset
3720     .addImm(ImmOffset);         // offset(imm)
3721 
3722   if (IsTyped)
3723     MIB.addImm(Format);
3724 
3725   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3726      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3727      .addMemOperand(MMO);
3728 
3729   MI.eraseFromParent();
3730   return true;
3731 }
3732 
3733 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3734                                              MachineRegisterInfo &MRI,
3735                                              MachineIRBuilder &B,
3736                                              bool IsFormat,
3737                                              bool IsTyped) const {
3738   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3739   MachineMemOperand *MMO = *MI.memoperands_begin();
3740   const int MemSize = MMO->getSize();
3741   const LLT S32 = LLT::scalar(32);
3742 
3743   Register Dst = MI.getOperand(0).getReg();
3744   Register RSrc = MI.getOperand(2).getReg();
3745 
3746   // The typed intrinsics add an immediate after the registers.
3747   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3748 
3749   // The struct intrinsic variants add one additional operand over raw.
3750   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3751   Register VIndex;
3752   int OpOffset = 0;
3753   if (HasVIndex) {
3754     VIndex = MI.getOperand(3).getReg();
3755     OpOffset = 1;
3756   }
3757 
3758   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3759   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3760 
3761   unsigned Format = 0;
3762   if (IsTyped) {
3763     Format = MI.getOperand(5 + OpOffset).getImm();
3764     ++OpOffset;
3765   }
3766 
3767   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3768   unsigned ImmOffset;
3769   unsigned TotalOffset;
3770 
3771   LLT Ty = MRI.getType(Dst);
3772   LLT EltTy = Ty.getScalarType();
3773   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3774   const bool Unpacked = ST.hasUnpackedD16VMem();
3775 
3776   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3777   if (TotalOffset != 0)
3778     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3779 
3780   unsigned Opc;
3781 
3782   if (IsTyped) {
3783     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3784                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3785   } else if (IsFormat) {
3786     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3787                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3788   } else {
3789     switch (MemSize) {
3790     case 1:
3791       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3792       break;
3793     case 2:
3794       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3795       break;
3796     default:
3797       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3798       break;
3799     }
3800   }
3801 
3802   Register LoadDstReg;
3803 
3804   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3805   LLT UnpackedTy = Ty.changeElementSize(32);
3806 
3807   if (IsExtLoad)
3808     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3809   else if (Unpacked && IsD16 && Ty.isVector())
3810     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3811   else
3812     LoadDstReg = Dst;
3813 
3814   if (!VIndex)
3815     VIndex = B.buildConstant(S32, 0).getReg(0);
3816 
3817   auto MIB = B.buildInstr(Opc)
3818     .addDef(LoadDstReg)         // vdata
3819     .addUse(RSrc)               // rsrc
3820     .addUse(VIndex)             // vindex
3821     .addUse(VOffset)            // voffset
3822     .addUse(SOffset)            // soffset
3823     .addImm(ImmOffset);         // offset(imm)
3824 
3825   if (IsTyped)
3826     MIB.addImm(Format);
3827 
3828   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3829      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3830      .addMemOperand(MMO);
3831 
3832   if (LoadDstReg != Dst) {
3833     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3834 
3835     // Widen result for extending loads was widened.
3836     if (IsExtLoad)
3837       B.buildTrunc(Dst, LoadDstReg);
3838     else {
3839       // Repack to original 16-bit vector result
3840       // FIXME: G_TRUNC should work, but legalization currently fails
3841       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3842       SmallVector<Register, 4> Repack;
3843       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3844         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3845       B.buildMerge(Dst, Repack);
3846     }
3847   }
3848 
3849   MI.eraseFromParent();
3850   return true;
3851 }
3852 
3853 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3854                                                MachineIRBuilder &B,
3855                                                bool IsInc) const {
3856   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3857                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3858   B.buildInstr(Opc)
3859     .addDef(MI.getOperand(0).getReg())
3860     .addUse(MI.getOperand(2).getReg())
3861     .addUse(MI.getOperand(3).getReg())
3862     .cloneMemRefs(MI);
3863   MI.eraseFromParent();
3864   return true;
3865 }
3866 
3867 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3868   switch (IntrID) {
3869   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3870   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3871     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3872   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3873   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3874     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3875   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3876   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3877     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3878   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3879   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3880     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3881   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3882   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3883     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3884   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3885   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3886     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3887   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3888   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3889     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3890   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3891   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3892     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3893   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3894   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3895     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3896   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3897   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3898     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3899   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3900   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3901     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3902   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3903   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3904     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3905   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3906   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3907     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3908   case Intrinsic::amdgcn_buffer_atomic_fadd:
3909   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3910   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3911     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3912   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3913   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3914     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3915   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3916   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3917     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3918   default:
3919     llvm_unreachable("unhandled atomic opcode");
3920   }
3921 }
3922 
3923 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3924                                                MachineIRBuilder &B,
3925                                                Intrinsic::ID IID) const {
3926   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3927                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3928   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3929 
3930   Register Dst;
3931 
3932   int OpOffset = 0;
3933   if (HasReturn) {
3934     // A few FP atomics do not support return values.
3935     Dst = MI.getOperand(0).getReg();
3936   } else {
3937     OpOffset = -1;
3938   }
3939 
3940   Register VData = MI.getOperand(2 + OpOffset).getReg();
3941   Register CmpVal;
3942 
3943   if (IsCmpSwap) {
3944     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3945     ++OpOffset;
3946   }
3947 
3948   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3949   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3950 
3951   // The struct intrinsic variants add one additional operand over raw.
3952   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3953   Register VIndex;
3954   if (HasVIndex) {
3955     VIndex = MI.getOperand(4 + OpOffset).getReg();
3956     ++OpOffset;
3957   }
3958 
3959   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3960   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3961   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3962 
3963   MachineMemOperand *MMO = *MI.memoperands_begin();
3964 
3965   unsigned ImmOffset;
3966   unsigned TotalOffset;
3967   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3968   if (TotalOffset != 0)
3969     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3970 
3971   if (!VIndex)
3972     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3973 
3974   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3975 
3976   if (HasReturn)
3977     MIB.addDef(Dst);
3978 
3979   MIB.addUse(VData); // vdata
3980 
3981   if (IsCmpSwap)
3982     MIB.addReg(CmpVal);
3983 
3984   MIB.addUse(RSrc)               // rsrc
3985      .addUse(VIndex)             // vindex
3986      .addUse(VOffset)            // voffset
3987      .addUse(SOffset)            // soffset
3988      .addImm(ImmOffset)          // offset(imm)
3989      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3990      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3991      .addMemOperand(MMO);
3992 
3993   MI.eraseFromParent();
3994   return true;
3995 }
3996 
3997 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3998 /// vector with s16 typed elements.
3999 static void packImageA16AddressToDwords(
4000     MachineIRBuilder &B, MachineInstr &MI,
4001     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4002     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4003   const LLT S16 = LLT::scalar(16);
4004   const LLT V2S16 = LLT::vector(2, 16);
4005 
4006   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4007     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4008     if (!SrcOp.isReg())
4009       continue; // _L to _LZ may have eliminated this.
4010 
4011     Register AddrReg = SrcOp.getReg();
4012 
4013     if (I < Intr->GradientStart) {
4014       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4015       PackedAddrs.push_back(AddrReg);
4016     } else {
4017       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4018       // derivatives dx/dh and dx/dv are packed with undef.
4019       if (((I + 1) >= EndIdx) ||
4020           ((Intr->NumGradients / 2) % 2 == 1 &&
4021            (I == static_cast<unsigned>(Intr->GradientStart +
4022                                        (Intr->NumGradients / 2) - 1) ||
4023             I == static_cast<unsigned>(Intr->GradientStart +
4024                                        Intr->NumGradients - 1))) ||
4025           // Check for _L to _LZ optimization
4026           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4027         PackedAddrs.push_back(
4028             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4029                 .getReg(0));
4030       } else {
4031         PackedAddrs.push_back(
4032             B.buildBuildVector(
4033                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4034                 .getReg(0));
4035         ++I;
4036       }
4037     }
4038   }
4039 }
4040 
4041 /// Convert from separate vaddr components to a single vector address register,
4042 /// and replace the remaining operands with $noreg.
4043 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4044                                      int DimIdx, int NumVAddrs) {
4045   const LLT S32 = LLT::scalar(32);
4046 
4047   SmallVector<Register, 8> AddrRegs;
4048   for (int I = 0; I != NumVAddrs; ++I) {
4049     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4050     if (SrcOp.isReg()) {
4051       AddrRegs.push_back(SrcOp.getReg());
4052       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4053     }
4054   }
4055 
4056   int NumAddrRegs = AddrRegs.size();
4057   if (NumAddrRegs != 1) {
4058     // Round up to 8 elements for v5-v7
4059     // FIXME: Missing intermediate sized register classes and instructions.
4060     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4061       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4062       auto Undef = B.buildUndef(S32);
4063       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4064       NumAddrRegs = RoundedNumRegs;
4065     }
4066 
4067     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4068     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4069   }
4070 
4071   for (int I = 1; I != NumVAddrs; ++I) {
4072     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4073     if (SrcOp.isReg())
4074       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4075   }
4076 }
4077 
4078 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4079 ///
4080 /// Depending on the subtarget, load/store with 16-bit element data need to be
4081 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4082 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4083 /// registers.
4084 ///
4085 /// We don't want to directly select image instructions just yet, but also want
4086 /// to exposes all register repacking to the legalizer/combiners. We also don't
4087 /// want a selected instrution entering RegBankSelect. In order to avoid
4088 /// defining a multitude of intermediate image instructions, directly hack on
4089 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding
4090 /// now unnecessary arguments with $noreg.
4091 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4092     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4093     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4094 
4095   const unsigned NumDefs = MI.getNumExplicitDefs();
4096   const unsigned ArgOffset = NumDefs + 1;
4097   bool IsTFE = NumDefs == 2;
4098   // We are only processing the operands of d16 image operations on subtargets
4099   // that use the unpacked register layout, or need to repack the TFE result.
4100 
4101   // TODO: Do we need to guard against already legalized intrinsics?
4102   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4103       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4104 
4105   MachineRegisterInfo *MRI = B.getMRI();
4106   const LLT S32 = LLT::scalar(32);
4107   const LLT S16 = LLT::scalar(16);
4108   const LLT V2S16 = LLT::vector(2, 16);
4109 
4110   unsigned DMask = 0;
4111 
4112   // Check for 16 bit addresses and pack if true.
4113   LLT GradTy =
4114       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4115   LLT AddrTy =
4116       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4117   const bool IsG16 = GradTy == S16;
4118   const bool IsA16 = AddrTy == S16;
4119 
4120   int DMaskLanes = 0;
4121   if (!BaseOpcode->Atomic) {
4122     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4123     if (BaseOpcode->Gather4) {
4124       DMaskLanes = 4;
4125     } else if (DMask != 0) {
4126       DMaskLanes = countPopulation(DMask);
4127     } else if (!IsTFE && !BaseOpcode->Store) {
4128       // If dmask is 0, this is a no-op load. This can be eliminated.
4129       B.buildUndef(MI.getOperand(0));
4130       MI.eraseFromParent();
4131       return true;
4132     }
4133   }
4134 
4135   Observer.changingInstr(MI);
4136   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4137 
4138   unsigned NewOpcode = NumDefs == 0 ?
4139     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4140 
4141   // Track that we legalized this
4142   MI.setDesc(B.getTII().get(NewOpcode));
4143 
4144   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
4145   // dmask to be at least 1 otherwise the instruction will fail
4146   if (IsTFE && DMask == 0) {
4147     DMask = 0x1;
4148     DMaskLanes = 1;
4149     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4150   }
4151 
4152   if (BaseOpcode->Atomic) {
4153     Register VData0 = MI.getOperand(2).getReg();
4154     LLT Ty = MRI->getType(VData0);
4155 
4156     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4157     if (Ty.isVector())
4158       return false;
4159 
4160     if (BaseOpcode->AtomicX2) {
4161       Register VData1 = MI.getOperand(3).getReg();
4162       // The two values are packed in one register.
4163       LLT PackedTy = LLT::vector(2, Ty);
4164       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4165       MI.getOperand(2).setReg(Concat.getReg(0));
4166       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4167     }
4168   }
4169 
4170   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4171 
4172   // Optimize _L to _LZ when _L is zero
4173   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4174           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4175     const ConstantFP *ConstantLod;
4176 
4177     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4178                  m_GFCst(ConstantLod))) {
4179       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4180         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4181         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4182             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4183                                                       Intr->Dim);
4184 
4185         // The starting indexes should remain in the same place.
4186         --CorrectedNumVAddrs;
4187 
4188         MI.getOperand(MI.getNumExplicitDefs())
4189             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4190         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4191         Intr = NewImageDimIntr;
4192       }
4193     }
4194   }
4195 
4196   // Optimize _mip away, when 'lod' is zero
4197   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4198     int64_t ConstantLod;
4199     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4200                  m_ICst(ConstantLod))) {
4201       if (ConstantLod == 0) {
4202         // TODO: Change intrinsic opcode and remove operand instead or replacing
4203         // it with 0, as the _L to _LZ handling is done above.
4204         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4205         --CorrectedNumVAddrs;
4206       }
4207     }
4208   }
4209 
4210   // Rewrite the addressing register layout before doing anything else.
4211   if (IsA16 || IsG16) {
4212     if (IsA16) {
4213       // Target must support the feature and gradients need to be 16 bit too
4214       if (!ST.hasA16() || !IsG16)
4215         return false;
4216     } else if (!ST.hasG16())
4217       return false;
4218 
4219     if (Intr->NumVAddrs > 1) {
4220       SmallVector<Register, 4> PackedRegs;
4221       // Don't compress addresses for G16
4222       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4223       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4224                                   PackEndIdx);
4225 
4226       if (!IsA16) {
4227         // Add uncompressed address
4228         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4229           int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4230           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4231           PackedRegs.push_back(AddrReg);
4232         }
4233       }
4234 
4235       // See also below in the non-a16 branch
4236       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4237 
4238       if (!UseNSA && PackedRegs.size() > 1) {
4239         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4240         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4241         PackedRegs[0] = Concat.getReg(0);
4242         PackedRegs.resize(1);
4243       }
4244 
4245       const unsigned NumPacked = PackedRegs.size();
4246       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4247         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4248         if (!SrcOp.isReg()) {
4249           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4250           continue;
4251         }
4252 
4253         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4254 
4255         if (I - Intr->VAddrStart < NumPacked)
4256           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4257         else
4258           SrcOp.setReg(AMDGPU::NoRegister);
4259       }
4260     }
4261   } else {
4262     // If the register allocator cannot place the address registers contiguously
4263     // without introducing moves, then using the non-sequential address encoding
4264     // is always preferable, since it saves VALU instructions and is usually a
4265     // wash in terms of code size or even better.
4266     //
4267     // However, we currently have no way of hinting to the register allocator
4268     // that MIMG addresses should be placed contiguously when it is possible to
4269     // do so, so force non-NSA for the common 2-address case as a heuristic.
4270     //
4271     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4272     // allocation when possible.
4273     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4274 
4275     if (!UseNSA && Intr->NumVAddrs > 1)
4276       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4277                                Intr->NumVAddrs);
4278   }
4279 
4280   int Flags = 0;
4281   if (IsA16)
4282     Flags |= 1;
4283   if (IsG16)
4284     Flags |= 2;
4285   MI.addOperand(MachineOperand::CreateImm(Flags));
4286 
4287   if (BaseOpcode->Store) { // No TFE for stores?
4288     // TODO: Handle dmask trim
4289     Register VData = MI.getOperand(1).getReg();
4290     LLT Ty = MRI->getType(VData);
4291     if (!Ty.isVector() || Ty.getElementType() != S16)
4292       return true;
4293 
4294     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4295     if (RepackedReg != VData) {
4296       MI.getOperand(1).setReg(RepackedReg);
4297     }
4298 
4299     return true;
4300   }
4301 
4302   Register DstReg = MI.getOperand(0).getReg();
4303   LLT Ty = MRI->getType(DstReg);
4304   const LLT EltTy = Ty.getScalarType();
4305   const bool IsD16 = Ty.getScalarType() == S16;
4306   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4307 
4308   // Confirm that the return type is large enough for the dmask specified
4309   if (NumElts < DMaskLanes)
4310     return false;
4311 
4312   if (NumElts > 4 || DMaskLanes > 4)
4313     return false;
4314 
4315   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4316   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4317 
4318   // The raw dword aligned data component of the load. The only legal cases
4319   // where this matters should be when using the packed D16 format, for
4320   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4321   LLT RoundedTy;
4322 
4323   // S32 vector to to cover all data, plus TFE result element.
4324   LLT TFETy;
4325 
4326   // Register type to use for each loaded component. Will be S32 or V2S16.
4327   LLT RegTy;
4328 
4329   if (IsD16 && ST.hasUnpackedD16VMem()) {
4330     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4331     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4332     RegTy = S32;
4333   } else {
4334     unsigned EltSize = EltTy.getSizeInBits();
4335     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4336     unsigned RoundedSize = 32 * RoundedElts;
4337     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4338     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4339     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4340   }
4341 
4342   // The return type does not need adjustment.
4343   // TODO: Should we change s16 case to s32 or <2 x s16>?
4344   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4345     return true;
4346 
4347   Register Dst1Reg;
4348 
4349   // Insert after the instruction.
4350   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4351 
4352   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4353   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4354   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4355   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4356 
4357   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4358 
4359   MI.getOperand(0).setReg(NewResultReg);
4360 
4361   // In the IR, TFE is supposed to be used with a 2 element struct return
4362   // type. The intruction really returns these two values in one contiguous
4363   // register, with one additional dword beyond the loaded data. Rewrite the
4364   // return type to use a single register result.
4365 
4366   if (IsTFE) {
4367     Dst1Reg = MI.getOperand(1).getReg();
4368     if (MRI->getType(Dst1Reg) != S32)
4369       return false;
4370 
4371     // TODO: Make sure the TFE operand bit is set.
4372     MI.RemoveOperand(1);
4373 
4374     // Handle the easy case that requires no repack instructions.
4375     if (Ty == S32) {
4376       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4377       return true;
4378     }
4379   }
4380 
4381   // Now figure out how to copy the new result register back into the old
4382   // result.
4383   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4384 
4385   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4386 
4387   if (ResultNumRegs == 1) {
4388     assert(!IsTFE);
4389     ResultRegs[0] = NewResultReg;
4390   } else {
4391     // We have to repack into a new vector of some kind.
4392     for (int I = 0; I != NumDataRegs; ++I)
4393       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4394     B.buildUnmerge(ResultRegs, NewResultReg);
4395 
4396     // Drop the final TFE element to get the data part. The TFE result is
4397     // directly written to the right place already.
4398     if (IsTFE)
4399       ResultRegs.resize(NumDataRegs);
4400   }
4401 
4402   // For an s16 scalar result, we form an s32 result with a truncate regardless
4403   // of packed vs. unpacked.
4404   if (IsD16 && !Ty.isVector()) {
4405     B.buildTrunc(DstReg, ResultRegs[0]);
4406     return true;
4407   }
4408 
4409   // Avoid a build/concat_vector of 1 entry.
4410   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4411     B.buildBitcast(DstReg, ResultRegs[0]);
4412     return true;
4413   }
4414 
4415   assert(Ty.isVector());
4416 
4417   if (IsD16) {
4418     // For packed D16 results with TFE enabled, all the data components are
4419     // S32. Cast back to the expected type.
4420     //
4421     // TODO: We don't really need to use load s32 elements. We would only need one
4422     // cast for the TFE result if a multiple of v2s16 was used.
4423     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4424       for (Register &Reg : ResultRegs)
4425         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4426     } else if (ST.hasUnpackedD16VMem()) {
4427       for (Register &Reg : ResultRegs)
4428         Reg = B.buildTrunc(S16, Reg).getReg(0);
4429     }
4430   }
4431 
4432   auto padWithUndef = [&](LLT Ty, int NumElts) {
4433     if (NumElts == 0)
4434       return;
4435     Register Undef = B.buildUndef(Ty).getReg(0);
4436     for (int I = 0; I != NumElts; ++I)
4437       ResultRegs.push_back(Undef);
4438   };
4439 
4440   // Pad out any elements eliminated due to the dmask.
4441   LLT ResTy = MRI->getType(ResultRegs[0]);
4442   if (!ResTy.isVector()) {
4443     padWithUndef(ResTy, NumElts - ResultRegs.size());
4444     B.buildBuildVector(DstReg, ResultRegs);
4445     return true;
4446   }
4447 
4448   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4449   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4450 
4451   // Deal with the one annoying legal case.
4452   const LLT V3S16 = LLT::vector(3, 16);
4453   if (Ty == V3S16) {
4454     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4455     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4456     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4457     return true;
4458   }
4459 
4460   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4461   B.buildConcatVectors(DstReg, ResultRegs);
4462   return true;
4463 }
4464 
4465 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4466   LegalizerHelper &Helper, MachineInstr &MI) const {
4467   MachineIRBuilder &B = Helper.MIRBuilder;
4468   GISelChangeObserver &Observer = Helper.Observer;
4469 
4470   Register Dst = MI.getOperand(0).getReg();
4471   LLT Ty = B.getMRI()->getType(Dst);
4472   unsigned Size = Ty.getSizeInBits();
4473   MachineFunction &MF = B.getMF();
4474 
4475   Observer.changingInstr(MI);
4476 
4477   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4478     Ty = getBitcastRegisterType(Ty);
4479     Helper.bitcastDst(MI, Ty, 0);
4480     Dst = MI.getOperand(0).getReg();
4481     B.setInsertPt(B.getMBB(), MI);
4482   }
4483 
4484   // FIXME: We don't really need this intermediate instruction. The intrinsic
4485   // should be fixed to have a memory operand. Since it's readnone, we're not
4486   // allowed to add one.
4487   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4488   MI.RemoveOperand(1); // Remove intrinsic ID
4489 
4490   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4491   // TODO: Should this use datalayout alignment?
4492   const unsigned MemSize = (Size + 7) / 8;
4493   const Align MemAlign(4);
4494   MachineMemOperand *MMO = MF.getMachineMemOperand(
4495       MachinePointerInfo(),
4496       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4497           MachineMemOperand::MOInvariant,
4498       MemSize, MemAlign);
4499   MI.addMemOperand(MF, MMO);
4500 
4501   // There are no 96-bit result scalar loads, but widening to 128-bit should
4502   // always be legal. We may need to restore this to a 96-bit result if it turns
4503   // out this needs to be converted to a vector load during RegBankSelect.
4504   if (!isPowerOf2_32(Size)) {
4505     if (Ty.isVector())
4506       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4507     else
4508       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4509   }
4510 
4511   Observer.changedInstr(MI);
4512   return true;
4513 }
4514 
4515 // TODO: Move to selection
4516 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4517                                                 MachineRegisterInfo &MRI,
4518                                                 MachineIRBuilder &B) const {
4519   // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
4520   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4521       !ST.isTrapHandlerEnabled()) {
4522     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4523   } else {
4524     // Pass queue pointer to trap handler as input, and insert trap instruction
4525     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4526     MachineRegisterInfo &MRI = *B.getMRI();
4527 
4528     Register LiveIn =
4529       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4530     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4531       return false;
4532 
4533     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4534     B.buildCopy(SGPR01, LiveIn);
4535     B.buildInstr(AMDGPU::S_TRAP)
4536         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4537         .addReg(SGPR01, RegState::Implicit);
4538   }
4539 
4540   MI.eraseFromParent();
4541   return true;
4542 }
4543 
4544 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4545     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4546   // Is non-HSA path or trap-handler disabled? then, report a warning
4547   // accordingly
4548   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4549       !ST.isTrapHandlerEnabled()) {
4550     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4551                                      "debugtrap handler not supported",
4552                                      MI.getDebugLoc(), DS_Warning);
4553     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4554     Ctx.diagnose(NoTrap);
4555   } else {
4556     // Insert debug-trap instruction
4557     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4558   }
4559 
4560   MI.eraseFromParent();
4561   return true;
4562 }
4563 
4564 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4565                                                MachineIRBuilder &B) const {
4566   MachineRegisterInfo &MRI = *B.getMRI();
4567   const LLT S16 = LLT::scalar(16);
4568   const LLT S32 = LLT::scalar(32);
4569 
4570   Register DstReg = MI.getOperand(0).getReg();
4571   Register NodePtr = MI.getOperand(2).getReg();
4572   Register RayExtent = MI.getOperand(3).getReg();
4573   Register RayOrigin = MI.getOperand(4).getReg();
4574   Register RayDir = MI.getOperand(5).getReg();
4575   Register RayInvDir = MI.getOperand(6).getReg();
4576   Register TDescr = MI.getOperand(7).getReg();
4577 
4578   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4579   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4580   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4581                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4582                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4583                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4584 
4585   SmallVector<Register, 12> Ops;
4586   if (Is64) {
4587     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4588     Ops.push_back(Unmerge.getReg(0));
4589     Ops.push_back(Unmerge.getReg(1));
4590   } else {
4591     Ops.push_back(NodePtr);
4592   }
4593   Ops.push_back(RayExtent);
4594 
4595   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4596     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4597     Ops.push_back(Unmerge.getReg(0));
4598     Ops.push_back(Unmerge.getReg(1));
4599     Ops.push_back(Unmerge.getReg(2));
4600   };
4601 
4602   packLanes(RayOrigin);
4603   if (IsA16) {
4604     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4605     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4606     Register R1 = MRI.createGenericVirtualRegister(S32);
4607     Register R2 = MRI.createGenericVirtualRegister(S32);
4608     Register R3 = MRI.createGenericVirtualRegister(S32);
4609     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4610     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4611     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4612     Ops.push_back(R1);
4613     Ops.push_back(R2);
4614     Ops.push_back(R3);
4615   } else {
4616     packLanes(RayDir);
4617     packLanes(RayInvDir);
4618   }
4619 
4620   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4621     .addDef(DstReg)
4622     .addImm(Opcode);
4623 
4624   for (Register R : Ops) {
4625     MIB.addUse(R);
4626   }
4627 
4628   MIB.addUse(TDescr)
4629      .addImm(IsA16 ? 1 : 0)
4630      .cloneMemRefs(MI);
4631 
4632   MI.eraseFromParent();
4633   return true;
4634 }
4635 
4636 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4637                                             MachineInstr &MI) const {
4638   MachineIRBuilder &B = Helper.MIRBuilder;
4639   MachineRegisterInfo &MRI = *B.getMRI();
4640 
4641   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4642   auto IntrID = MI.getIntrinsicID();
4643   switch (IntrID) {
4644   case Intrinsic::amdgcn_if:
4645   case Intrinsic::amdgcn_else: {
4646     MachineInstr *Br = nullptr;
4647     MachineBasicBlock *UncondBrTarget = nullptr;
4648     bool Negated = false;
4649     if (MachineInstr *BrCond =
4650             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4651       const SIRegisterInfo *TRI
4652         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4653 
4654       Register Def = MI.getOperand(1).getReg();
4655       Register Use = MI.getOperand(3).getReg();
4656 
4657       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4658 
4659       if (Negated)
4660         std::swap(CondBrTarget, UncondBrTarget);
4661 
4662       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4663       if (IntrID == Intrinsic::amdgcn_if) {
4664         B.buildInstr(AMDGPU::SI_IF)
4665           .addDef(Def)
4666           .addUse(Use)
4667           .addMBB(UncondBrTarget);
4668       } else {
4669         B.buildInstr(AMDGPU::SI_ELSE)
4670             .addDef(Def)
4671             .addUse(Use)
4672             .addMBB(UncondBrTarget);
4673       }
4674 
4675       if (Br) {
4676         Br->getOperand(0).setMBB(CondBrTarget);
4677       } else {
4678         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4679         // since we're swapping branch targets it needs to be reinserted.
4680         // FIXME: IRTranslator should probably not do this
4681         B.buildBr(*CondBrTarget);
4682       }
4683 
4684       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4685       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4686       MI.eraseFromParent();
4687       BrCond->eraseFromParent();
4688       return true;
4689     }
4690 
4691     return false;
4692   }
4693   case Intrinsic::amdgcn_loop: {
4694     MachineInstr *Br = nullptr;
4695     MachineBasicBlock *UncondBrTarget = nullptr;
4696     bool Negated = false;
4697     if (MachineInstr *BrCond =
4698             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4699       const SIRegisterInfo *TRI
4700         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4701 
4702       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4703       Register Reg = MI.getOperand(2).getReg();
4704 
4705       if (Negated)
4706         std::swap(CondBrTarget, UncondBrTarget);
4707 
4708       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4709       B.buildInstr(AMDGPU::SI_LOOP)
4710         .addUse(Reg)
4711         .addMBB(UncondBrTarget);
4712 
4713       if (Br)
4714         Br->getOperand(0).setMBB(CondBrTarget);
4715       else
4716         B.buildBr(*CondBrTarget);
4717 
4718       MI.eraseFromParent();
4719       BrCond->eraseFromParent();
4720       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4721       return true;
4722     }
4723 
4724     return false;
4725   }
4726   case Intrinsic::amdgcn_kernarg_segment_ptr:
4727     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4728       // This only makes sense to call in a kernel, so just lower to null.
4729       B.buildConstant(MI.getOperand(0).getReg(), 0);
4730       MI.eraseFromParent();
4731       return true;
4732     }
4733 
4734     return legalizePreloadedArgIntrin(
4735       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4736   case Intrinsic::amdgcn_implicitarg_ptr:
4737     return legalizeImplicitArgPtr(MI, MRI, B);
4738   case Intrinsic::amdgcn_workitem_id_x:
4739     return legalizePreloadedArgIntrin(MI, MRI, B,
4740                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4741   case Intrinsic::amdgcn_workitem_id_y:
4742     return legalizePreloadedArgIntrin(MI, MRI, B,
4743                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4744   case Intrinsic::amdgcn_workitem_id_z:
4745     return legalizePreloadedArgIntrin(MI, MRI, B,
4746                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4747   case Intrinsic::amdgcn_workgroup_id_x:
4748     return legalizePreloadedArgIntrin(MI, MRI, B,
4749                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4750   case Intrinsic::amdgcn_workgroup_id_y:
4751     return legalizePreloadedArgIntrin(MI, MRI, B,
4752                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4753   case Intrinsic::amdgcn_workgroup_id_z:
4754     return legalizePreloadedArgIntrin(MI, MRI, B,
4755                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4756   case Intrinsic::amdgcn_dispatch_ptr:
4757     return legalizePreloadedArgIntrin(MI, MRI, B,
4758                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4759   case Intrinsic::amdgcn_queue_ptr:
4760     return legalizePreloadedArgIntrin(MI, MRI, B,
4761                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4762   case Intrinsic::amdgcn_implicit_buffer_ptr:
4763     return legalizePreloadedArgIntrin(
4764       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4765   case Intrinsic::amdgcn_dispatch_id:
4766     return legalizePreloadedArgIntrin(MI, MRI, B,
4767                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4768   case Intrinsic::amdgcn_fdiv_fast:
4769     return legalizeFDIVFastIntrin(MI, MRI, B);
4770   case Intrinsic::amdgcn_is_shared:
4771     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4772   case Intrinsic::amdgcn_is_private:
4773     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4774   case Intrinsic::amdgcn_wavefrontsize: {
4775     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4776     MI.eraseFromParent();
4777     return true;
4778   }
4779   case Intrinsic::amdgcn_s_buffer_load:
4780     return legalizeSBufferLoad(Helper, MI);
4781   case Intrinsic::amdgcn_raw_buffer_store:
4782   case Intrinsic::amdgcn_struct_buffer_store:
4783     return legalizeBufferStore(MI, MRI, B, false, false);
4784   case Intrinsic::amdgcn_raw_buffer_store_format:
4785   case Intrinsic::amdgcn_struct_buffer_store_format:
4786     return legalizeBufferStore(MI, MRI, B, false, true);
4787   case Intrinsic::amdgcn_raw_tbuffer_store:
4788   case Intrinsic::amdgcn_struct_tbuffer_store:
4789     return legalizeBufferStore(MI, MRI, B, true, true);
4790   case Intrinsic::amdgcn_raw_buffer_load:
4791   case Intrinsic::amdgcn_struct_buffer_load:
4792     return legalizeBufferLoad(MI, MRI, B, false, false);
4793   case Intrinsic::amdgcn_raw_buffer_load_format:
4794   case Intrinsic::amdgcn_struct_buffer_load_format:
4795     return legalizeBufferLoad(MI, MRI, B, true, false);
4796   case Intrinsic::amdgcn_raw_tbuffer_load:
4797   case Intrinsic::amdgcn_struct_tbuffer_load:
4798     return legalizeBufferLoad(MI, MRI, B, true, true);
4799   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4800   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4801   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4802   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4803   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4804   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4805   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4806   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4807   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4808   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4809   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4810   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4811   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4812   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4813   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4814   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4815   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4816   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4817   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4818   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4819   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4820   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4821   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4822   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4823   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4824   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4825   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4826   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4827   case Intrinsic::amdgcn_buffer_atomic_fadd:
4828   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4829   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4830   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4831   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4832     return legalizeBufferAtomic(MI, B, IntrID);
4833   case Intrinsic::amdgcn_atomic_inc:
4834     return legalizeAtomicIncDec(MI, B, true);
4835   case Intrinsic::amdgcn_atomic_dec:
4836     return legalizeAtomicIncDec(MI, B, false);
4837   case Intrinsic::trap:
4838     return legalizeTrapIntrinsic(MI, MRI, B);
4839   case Intrinsic::debugtrap:
4840     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4841   case Intrinsic::amdgcn_rsq_clamp:
4842     return legalizeRsqClampIntrinsic(MI, MRI, B);
4843   case Intrinsic::amdgcn_ds_fadd:
4844   case Intrinsic::amdgcn_ds_fmin:
4845   case Intrinsic::amdgcn_ds_fmax:
4846     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4847   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4848     return legalizeBVHIntrinsic(MI, B);
4849   default: {
4850     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4851             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4852       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4853     return true;
4854   }
4855   }
4856 
4857   return true;
4858 }
4859