1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/ADT/ScopeExit.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 
28 #define DEBUG_TYPE "amdgpu-legalinfo"
29 
30 using namespace llvm;
31 using namespace LegalizeActions;
32 using namespace LegalizeMutations;
33 using namespace LegalityPredicates;
34 using namespace MIPatternMatch;
35 
36 // Hack until load/store selection patterns support any tuple of legal types.
37 static cl::opt<bool> EnableNewLegality(
38   "amdgpu-global-isel-new-legality",
39   cl::desc("Use GlobalISel desired legality, rather than try to use"
40            "rules compatible with selection patterns"),
41   cl::init(false),
42   cl::ReallyHidden);
43 
44 static constexpr unsigned MaxRegisterSize = 1024;
45 
46 // Round the number of elements to the next power of two elements
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the number of bits to the next power of two bits
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 /// \returs true if this is an odd sized vector which should widen by adding an
61 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
62 /// excludes s1 vectors, which should always be scalarized.
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     if (!Ty.isVector())
67       return false;
68 
69     const LLT EltTy = Ty.getElementType();
70     const unsigned EltSize = EltTy.getSizeInBits();
71     return Ty.getNumElements() % 2 != 0 &&
72            EltSize > 1 && EltSize < 32 &&
73            Ty.getSizeInBits() % 32 != 0;
74   };
75 }
76 
77 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
78   return [=](const LegalityQuery &Query) {
79     const LLT Ty = Query.Types[TypeIdx];
80     return Ty.getSizeInBits() % 32 == 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LLT getBitcastRegisterType(const LLT Ty) {
130   const unsigned Size = Ty.getSizeInBits();
131 
132   LLT CoercedTy;
133   if (Size <= 32) {
134     // <2 x s8> -> s16
135     // <4 x s8> -> s32
136     return LLT::scalar(Size);
137   }
138 
139   return LLT::scalarOrVector(Size / 32, 32);
140 }
141 
142 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
143   return [=](const LegalityQuery &Query) {
144     const LLT Ty = Query.Types[TypeIdx];
145     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
146   };
147 }
148 
149 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT Ty = Query.Types[TypeIdx];
152     unsigned Size = Ty.getSizeInBits();
153     assert(Size % 32 == 0);
154     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
155   };
156 }
157 
158 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
159   return [=](const LegalityQuery &Query) {
160     const LLT QueryTy = Query.Types[TypeIdx];
161     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
162   };
163 }
164 
165 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
166   return [=](const LegalityQuery &Query) {
167     const LLT QueryTy = Query.Types[TypeIdx];
168     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
169   };
170 }
171 
172 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
173   return [=](const LegalityQuery &Query) {
174     const LLT QueryTy = Query.Types[TypeIdx];
175     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
176   };
177 }
178 
179 static bool isRegisterSize(unsigned Size) {
180   return Size % 32 == 0 && Size <= MaxRegisterSize;
181 }
182 
183 static bool isRegisterVectorElementType(LLT EltTy) {
184   const int EltSize = EltTy.getSizeInBits();
185   return EltSize == 16 || EltSize % 32 == 0;
186 }
187 
188 static bool isRegisterVectorType(LLT Ty) {
189   const int EltSize = Ty.getElementType().getSizeInBits();
190   return EltSize == 32 || EltSize == 64 ||
191          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
192          EltSize == 128 || EltSize == 256;
193 }
194 
195 static bool isRegisterType(LLT Ty) {
196   if (!isRegisterSize(Ty.getSizeInBits()))
197     return false;
198 
199   if (Ty.isVector())
200     return isRegisterVectorType(Ty);
201 
202   return true;
203 }
204 
205 // Any combination of 32 or 64-bit elements up the maximum register size, and
206 // multiples of v2s16.
207 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
208   return [=](const LegalityQuery &Query) {
209     return isRegisterType(Query.Types[TypeIdx]);
210   };
211 }
212 
213 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     const LLT QueryTy = Query.Types[TypeIdx];
216     if (!QueryTy.isVector())
217       return false;
218     const LLT EltTy = QueryTy.getElementType();
219     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
220   };
221 }
222 
223 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
224   return [=](const LegalityQuery &Query) {
225     const LLT Ty = Query.Types[TypeIdx];
226     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
227            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
228   };
229 }
230 
231 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
232 // handle some operations by just promoting the register during
233 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
234 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
235                                     bool IsLoad) {
236   switch (AS) {
237   case AMDGPUAS::PRIVATE_ADDRESS:
238     // FIXME: Private element size.
239     return ST.enableFlatScratch() ? 128 : 32;
240   case AMDGPUAS::LOCAL_ADDRESS:
241     return ST.useDS128() ? 128 : 64;
242   case AMDGPUAS::GLOBAL_ADDRESS:
243   case AMDGPUAS::CONSTANT_ADDRESS:
244   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
245     // Treat constant and global as identical. SMRD loads are sometimes usable for
246     // global loads (ideally constant address space should be eliminated)
247     // depending on the context. Legality cannot be context dependent, but
248     // RegBankSelect can split the load as necessary depending on the pointer
249     // register bank/uniformity and if the memory is invariant or not written in a
250     // kernel.
251     return IsLoad ? 512 : 128;
252   default:
253     // Flat addresses may contextually need to be split to 32-bit parts if they
254     // may alias scratch depending on the subtarget.
255     return 128;
256   }
257 }
258 
259 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
260                                  const LegalityQuery &Query,
261                                  unsigned Opcode) {
262   const LLT Ty = Query.Types[0];
263 
264   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
265   const bool IsLoad = Opcode != AMDGPU::G_STORE;
266 
267   unsigned RegSize = Ty.getSizeInBits();
268   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
269   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
270   unsigned AS = Query.Types[1].getAddressSpace();
271 
272   // All of these need to be custom lowered to cast the pointer operand.
273   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
274     return false;
275 
276   // TODO: We should be able to widen loads if the alignment is high enough, but
277   // we also need to modify the memory access size.
278 #if 0
279   // Accept widening loads based on alignment.
280   if (IsLoad && MemSize < Size)
281     MemSize = std::max(MemSize, Align);
282 #endif
283 
284   // Only 1-byte and 2-byte to 32-bit extloads are valid.
285   if (MemSize != RegSize && RegSize != 32)
286     return false;
287 
288   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
289     return false;
290 
291   switch (MemSize) {
292   case 8:
293   case 16:
294   case 32:
295   case 64:
296   case 128:
297     break;
298   case 96:
299     if (!ST.hasDwordx3LoadStores())
300       return false;
301     break;
302   case 256:
303   case 512:
304     // These may contextually need to be broken down.
305     break;
306   default:
307     return false;
308   }
309 
310   assert(RegSize >= MemSize);
311 
312   if (AlignBits < MemSize) {
313     const SITargetLowering *TLI = ST.getTargetLowering();
314     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
315                                                  Align(AlignBits / 8)))
316       return false;
317   }
318 
319   return true;
320 }
321 
322 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
323 // workaround this. Eventually it should ignore the type for loads and only care
324 // about the size. Return true in cases where we will workaround this for now by
325 // bitcasting.
326 static bool loadStoreBitcastWorkaround(const LLT Ty) {
327   if (EnableNewLegality)
328     return false;
329 
330   const unsigned Size = Ty.getSizeInBits();
331   if (Size <= 64)
332     return false;
333   if (!Ty.isVector())
334     return true;
335 
336   LLT EltTy = Ty.getElementType();
337   if (EltTy.isPointer())
338     return true;
339 
340   unsigned EltSize = EltTy.getSizeInBits();
341   return EltSize != 32 && EltSize != 64;
342 }
343 
344 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
345                              unsigned Opcode) {
346   const LLT Ty = Query.Types[0];
347   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
348          !loadStoreBitcastWorkaround(Ty);
349 }
350 
351 /// Return true if a load or store of the type should be lowered with a bitcast
352 /// to a different type.
353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354                                        const unsigned MemSizeInBits) {
355   const unsigned Size = Ty.getSizeInBits();
356     if (Size != MemSizeInBits)
357       return Size <= 32 && Ty.isVector();
358 
359   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360     return true;
361   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362          !isRegisterVectorElementType(Ty.getElementType());
363 }
364 
365 /// Return true if we should legalize a load by widening an odd sized memory
366 /// access up to the alignment. Note this case when the memory access itself
367 /// changes, not the size of the result register.
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
369                             unsigned AlignInBits, unsigned AddrSpace,
370                             unsigned Opcode) {
371   // We don't want to widen cases that are naturally legal.
372   if (isPowerOf2_32(SizeInBits))
373     return false;
374 
375   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
376   // end up widening these for a scalar load during RegBankSelect, since there
377   // aren't 96-bit scalar loads.
378   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
379     return false;
380 
381   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
382     return false;
383 
384   // A load is known dereferenceable up to the alignment, so it's legal to widen
385   // to it.
386   //
387   // TODO: Could check dereferenceable for less aligned cases.
388   unsigned RoundedSize = NextPowerOf2(SizeInBits);
389   if (AlignInBits < RoundedSize)
390     return false;
391 
392   // Do not widen if it would introduce a slow unaligned load.
393   const SITargetLowering *TLI = ST.getTargetLowering();
394   bool Fast = false;
395   return TLI->allowsMisalignedMemoryAccessesImpl(
396              RoundedSize, AddrSpace, Align(AlignInBits / 8),
397              MachineMemOperand::MOLoad, &Fast) &&
398          Fast;
399 }
400 
401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402                             unsigned Opcode) {
403   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404     return false;
405 
406   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407                          Query.MMODescrs[0].AlignInBits,
408                          Query.Types[1].getAddressSpace(), Opcode);
409 }
410 
411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
412                                          const GCNTargetMachine &TM)
413   :  ST(ST_) {
414   using namespace TargetOpcode;
415 
416   auto GetAddrSpacePtr = [&TM](unsigned AS) {
417     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
418   };
419 
420   const LLT S1 = LLT::scalar(1);
421   const LLT S8 = LLT::scalar(8);
422   const LLT S16 = LLT::scalar(16);
423   const LLT S32 = LLT::scalar(32);
424   const LLT S64 = LLT::scalar(64);
425   const LLT S128 = LLT::scalar(128);
426   const LLT S256 = LLT::scalar(256);
427   const LLT S512 = LLT::scalar(512);
428   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
429 
430   const LLT V2S8 = LLT::vector(2, 8);
431   const LLT V2S16 = LLT::vector(2, 16);
432   const LLT V4S16 = LLT::vector(4, 16);
433 
434   const LLT V2S32 = LLT::vector(2, 32);
435   const LLT V3S32 = LLT::vector(3, 32);
436   const LLT V4S32 = LLT::vector(4, 32);
437   const LLT V5S32 = LLT::vector(5, 32);
438   const LLT V6S32 = LLT::vector(6, 32);
439   const LLT V7S32 = LLT::vector(7, 32);
440   const LLT V8S32 = LLT::vector(8, 32);
441   const LLT V9S32 = LLT::vector(9, 32);
442   const LLT V10S32 = LLT::vector(10, 32);
443   const LLT V11S32 = LLT::vector(11, 32);
444   const LLT V12S32 = LLT::vector(12, 32);
445   const LLT V13S32 = LLT::vector(13, 32);
446   const LLT V14S32 = LLT::vector(14, 32);
447   const LLT V15S32 = LLT::vector(15, 32);
448   const LLT V16S32 = LLT::vector(16, 32);
449   const LLT V32S32 = LLT::vector(32, 32);
450 
451   const LLT V2S64 = LLT::vector(2, 64);
452   const LLT V3S64 = LLT::vector(3, 64);
453   const LLT V4S64 = LLT::vector(4, 64);
454   const LLT V5S64 = LLT::vector(5, 64);
455   const LLT V6S64 = LLT::vector(6, 64);
456   const LLT V7S64 = LLT::vector(7, 64);
457   const LLT V8S64 = LLT::vector(8, 64);
458   const LLT V16S64 = LLT::vector(16, 64);
459 
460   std::initializer_list<LLT> AllS32Vectors =
461     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463   std::initializer_list<LLT> AllS64Vectors =
464     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
465 
466   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
467   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
468   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
469   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
470   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
471   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
472   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
473 
474   const LLT CodePtr = FlatPtr;
475 
476   const std::initializer_list<LLT> AddrSpaces64 = {
477     GlobalPtr, ConstantPtr, FlatPtr
478   };
479 
480   const std::initializer_list<LLT> AddrSpaces32 = {
481     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
482   };
483 
484   const std::initializer_list<LLT> FPTypesBase = {
485     S32, S64
486   };
487 
488   const std::initializer_list<LLT> FPTypes16 = {
489     S32, S64, S16
490   };
491 
492   const std::initializer_list<LLT> FPTypesPK16 = {
493     S32, S64, S16, V2S16
494   };
495 
496   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
497 
498   setAction({G_BRCOND, S1}, Legal); // VCC branches
499   setAction({G_BRCOND, S32}, Legal); // SCC branches
500 
501   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
502   // elements for v3s16
503   getActionDefinitionsBuilder(G_PHI)
504     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505     .legalFor(AllS32Vectors)
506     .legalFor(AllS64Vectors)
507     .legalFor(AddrSpaces64)
508     .legalFor(AddrSpaces32)
509     .legalIf(isPointer(0))
510     .clampScalar(0, S16, S256)
511     .widenScalarToNextPow2(0, 32)
512     .clampMaxNumElements(0, S32, 16)
513     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514     .scalarize(0);
515 
516   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517     // Full set of gfx9 features.
518     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
519       .legalFor({S32, S16, V2S16})
520       .clampScalar(0, S16, S32)
521       .clampMaxNumElements(0, S16, 2)
522       .scalarize(0)
523       .widenScalarToNextPow2(0, 32);
524 
525     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526       .legalFor({S32, S16, V2S16}) // Clamp modifier
527       .minScalarOrElt(0, S16)
528       .clampMaxNumElements(0, S16, 2)
529       .scalarize(0)
530       .widenScalarToNextPow2(0, 32)
531       .lower();
532   } else if (ST.has16BitInsts()) {
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16})
535       .clampScalar(0, S16, S32)
536       .scalarize(0)
537       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538 
539     // Technically the saturating operations require clamp bit support, but this
540     // was introduced at the same time as 16-bit operations.
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542       .legalFor({S32, S16}) // Clamp modifier
543       .minScalar(0, S16)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 16)
546       .lower();
547 
548     // We're just lowering this, but it helps get a better result to try to
549     // coerce to the desired type first.
550     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551       .minScalar(0, S16)
552       .scalarize(0)
553       .lower();
554   } else {
555     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
556       .legalFor({S32})
557       .clampScalar(0, S32, S32)
558       .scalarize(0);
559 
560     if (ST.hasIntClamp()) {
561       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562         .legalFor({S32}) // Clamp modifier.
563         .scalarize(0)
564         .minScalarOrElt(0, S32)
565         .lower();
566     } else {
567       // Clamp bit support was added in VI, along with 16-bit operations.
568       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569         .minScalar(0, S32)
570         .scalarize(0)
571         .lower();
572     }
573 
574     // FIXME: DAG expansion gets better results. The widening uses the smaller
575     // range values and goes for the min/max lowering directly.
576     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577       .minScalar(0, S32)
578       .scalarize(0)
579       .lower();
580   }
581 
582   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
583     .customFor({S32, S64})
584     .clampScalar(0, S32, S64)
585     .widenScalarToNextPow2(0, 32)
586     .scalarize(0);
587 
588   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
589                    .legalFor({S32})
590                    .maxScalarOrElt(0, S32);
591 
592   if (ST.hasVOP3PInsts()) {
593     Mulh
594       .clampMaxNumElements(0, S8, 2)
595       .lowerFor({V2S8});
596   }
597 
598   Mulh
599     .scalarize(0)
600     .lower();
601 
602   // Report legal for any types we can handle anywhere. For the cases only legal
603   // on the SALU, RegBankSelect will be able to re-legalize.
604   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
605     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
606     .clampScalar(0, S32, S64)
607     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
608     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
609     .widenScalarToNextPow2(0)
610     .scalarize(0);
611 
612   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
613                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614     .legalFor({{S32, S1}, {S32, S32}})
615     .minScalar(0, S32)
616     // TODO: .scalarize(0)
617     .lower();
618 
619   getActionDefinitionsBuilder(G_BITCAST)
620     // Don't worry about the size constraint.
621     .legalIf(all(isRegisterType(0), isRegisterType(1)))
622     .lower();
623 
624 
625   getActionDefinitionsBuilder(G_CONSTANT)
626     .legalFor({S1, S32, S64, S16, GlobalPtr,
627                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
628     .legalIf(isPointer(0))
629     .clampScalar(0, S32, S64)
630     .widenScalarToNextPow2(0);
631 
632   getActionDefinitionsBuilder(G_FCONSTANT)
633     .legalFor({S32, S64, S16})
634     .clampScalar(0, S16, S64);
635 
636   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
637       .legalIf(isRegisterType(0))
638       // s1 and s16 are special cases because they have legal operations on
639       // them, but don't really occupy registers in the normal way.
640       .legalFor({S1, S16})
641       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
642       .clampScalarOrElt(0, S32, MaxScalar)
643       .widenScalarToNextPow2(0, 32)
644       .clampMaxNumElements(0, S32, 16);
645 
646   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
647 
648   // If the amount is divergent, we have to do a wave reduction to get the
649   // maximum value, so this is expanded during RegBankSelect.
650   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
651     .legalFor({{PrivatePtr, S32}});
652 
653   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
654     .customIf(typeIsNot(0, PrivatePtr));
655 
656   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
657 
658   auto &FPOpActions = getActionDefinitionsBuilder(
659     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
660     .legalFor({S32, S64});
661   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
662     .customFor({S32, S64});
663   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
664     .customFor({S32, S64});
665 
666   if (ST.has16BitInsts()) {
667     if (ST.hasVOP3PInsts())
668       FPOpActions.legalFor({S16, V2S16});
669     else
670       FPOpActions.legalFor({S16});
671 
672     TrigActions.customFor({S16});
673     FDIVActions.customFor({S16});
674   }
675 
676   auto &MinNumMaxNum = getActionDefinitionsBuilder({
677       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
678 
679   if (ST.hasVOP3PInsts()) {
680     MinNumMaxNum.customFor(FPTypesPK16)
681       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682       .clampMaxNumElements(0, S16, 2)
683       .clampScalar(0, S16, S64)
684       .scalarize(0);
685   } else if (ST.has16BitInsts()) {
686     MinNumMaxNum.customFor(FPTypes16)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else {
690     MinNumMaxNum.customFor(FPTypesBase)
691       .clampScalar(0, S32, S64)
692       .scalarize(0);
693   }
694 
695   if (ST.hasVOP3PInsts())
696     FPOpActions.clampMaxNumElements(0, S16, 2);
697 
698   FPOpActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   TrigActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   FDIVActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   getActionDefinitionsBuilder({G_FNEG, G_FABS})
711     .legalFor(FPTypesPK16)
712     .clampMaxNumElements(0, S16, 2)
713     .scalarize(0)
714     .clampScalar(0, S16, S64);
715 
716   if (ST.has16BitInsts()) {
717     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
718       .legalFor({S32, S64, S16})
719       .scalarize(0)
720       .clampScalar(0, S16, S64);
721   } else {
722     getActionDefinitionsBuilder(G_FSQRT)
723       .legalFor({S32, S64})
724       .scalarize(0)
725       .clampScalar(0, S32, S64);
726 
727     if (ST.hasFractBug()) {
728       getActionDefinitionsBuilder(G_FFLOOR)
729         .customFor({S64})
730         .legalFor({S32, S64})
731         .scalarize(0)
732         .clampScalar(0, S32, S64);
733     } else {
734       getActionDefinitionsBuilder(G_FFLOOR)
735         .legalFor({S32, S64})
736         .scalarize(0)
737         .clampScalar(0, S32, S64);
738     }
739   }
740 
741   getActionDefinitionsBuilder(G_FPTRUNC)
742     .legalFor({{S32, S64}, {S16, S32}})
743     .scalarize(0)
744     .lower();
745 
746   getActionDefinitionsBuilder(G_FPEXT)
747     .legalFor({{S64, S32}, {S32, S16}})
748     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
749     .scalarize(0);
750 
751   getActionDefinitionsBuilder(G_FSUB)
752       // Use actual fsub instruction
753       .legalFor({S32})
754       // Must use fadd + fneg
755       .lowerFor({S64, S16, V2S16})
756       .scalarize(0)
757       .clampScalar(0, S32, S64);
758 
759   // Whether this is legal depends on the floating point mode for the function.
760   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
761   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
762     FMad.customFor({S32, S16});
763   else if (ST.hasMadMacF32Insts())
764     FMad.customFor({S32});
765   else if (ST.hasMadF16())
766     FMad.customFor({S16});
767   FMad.scalarize(0)
768       .lower();
769 
770   auto &FRem = getActionDefinitionsBuilder(G_FREM);
771   if (ST.has16BitInsts()) {
772     FRem.customFor({S16, S32, S64});
773   } else {
774     FRem.minScalar(0, S32)
775         .customFor({S32, S64});
776   }
777   FRem.scalarize(0);
778 
779   // TODO: Do we need to clamp maximum bitwidth?
780   getActionDefinitionsBuilder(G_TRUNC)
781     .legalIf(isScalar(0))
782     .legalFor({{V2S16, V2S32}})
783     .clampMaxNumElements(0, S16, 2)
784     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
785     // situations (like an invalid implicit use), we don't want to infinite loop
786     // in the legalizer.
787     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
788     .alwaysLegal();
789 
790   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
791     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
792                {S32, S1}, {S64, S1}, {S16, S1}})
793     .scalarize(0)
794     .clampScalar(0, S32, S64)
795     .widenScalarToNextPow2(1, 32);
796 
797   // TODO: Split s1->s64 during regbankselect for VALU.
798   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
799     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
800     .lowerFor({{S32, S64}})
801     .lowerIf(typeIs(1, S1))
802     .customFor({{S64, S64}});
803   if (ST.has16BitInsts())
804     IToFP.legalFor({{S16, S16}});
805   IToFP.clampScalar(1, S32, S64)
806        .minScalar(0, S32)
807        .scalarize(0)
808        .widenScalarToNextPow2(1);
809 
810   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
811     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812     .customFor({{S64, S64}})
813     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
814   if (ST.has16BitInsts())
815     FPToI.legalFor({{S16, S16}});
816   else
817     FPToI.minScalar(1, S32);
818 
819   FPToI.minScalar(0, S32)
820        .scalarize(0)
821        .lower();
822 
823   // Lower roundeven into G_FRINT
824   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
825     .scalarize(0)
826     .lower();
827 
828   if (ST.has16BitInsts()) {
829     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830       .legalFor({S16, S32, S64})
831       .clampScalar(0, S16, S64)
832       .scalarize(0);
833   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
834     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
835       .legalFor({S32, S64})
836       .clampScalar(0, S32, S64)
837       .scalarize(0);
838   } else {
839     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
840       .legalFor({S32})
841       .customFor({S64})
842       .clampScalar(0, S32, S64)
843       .scalarize(0);
844   }
845 
846   getActionDefinitionsBuilder(G_PTR_ADD)
847     .legalIf(all(isPointer(0), sameSize(0, 1)))
848     .scalarize(0)
849     .scalarSameSizeAs(1, 0);
850 
851   getActionDefinitionsBuilder(G_PTRMASK)
852     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
853     .scalarSameSizeAs(1, 0)
854     .scalarize(0);
855 
856   auto &CmpBuilder =
857     getActionDefinitionsBuilder(G_ICMP)
858     // The compare output type differs based on the register bank of the output,
859     // so make both s1 and s32 legal.
860     //
861     // Scalar compares producing output in scc will be promoted to s32, as that
862     // is the allocatable register type that will be needed for the copy from
863     // scc. This will be promoted during RegBankSelect, and we assume something
864     // before that won't try to use s32 result types.
865     //
866     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
867     // bank.
868     .legalForCartesianProduct(
869       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870     .legalForCartesianProduct(
871       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
872   if (ST.has16BitInsts()) {
873     CmpBuilder.legalFor({{S1, S16}});
874   }
875 
876   CmpBuilder
877     .widenScalarToNextPow2(1)
878     .clampScalar(1, S32, S64)
879     .scalarize(0)
880     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
881 
882   getActionDefinitionsBuilder(G_FCMP)
883     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
884     .widenScalarToNextPow2(1)
885     .clampScalar(1, S32, S64)
886     .scalarize(0);
887 
888   // FIXME: fpow has a selection pattern that should move to custom lowering.
889   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
890   if (ST.has16BitInsts())
891     Exp2Ops.legalFor({S32, S16});
892   else
893     Exp2Ops.legalFor({S32});
894   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
895   Exp2Ops.scalarize(0);
896 
897   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
898   if (ST.has16BitInsts())
899     ExpOps.customFor({{S32}, {S16}});
900   else
901     ExpOps.customFor({S32});
902   ExpOps.clampScalar(0, MinScalarFPTy, S32)
903         .scalarize(0);
904 
905   getActionDefinitionsBuilder(G_FPOWI)
906     .clampScalar(0, MinScalarFPTy, S32)
907     .lower();
908 
909   // The 64-bit versions produce 32-bit results, but only on the SALU.
910   getActionDefinitionsBuilder(G_CTPOP)
911     .legalFor({{S32, S32}, {S32, S64}})
912     .clampScalar(0, S32, S32)
913     .clampScalar(1, S32, S64)
914     .scalarize(0)
915     .widenScalarToNextPow2(0, 32)
916     .widenScalarToNextPow2(1, 32);
917 
918   // The hardware instructions return a different result on 0 than the generic
919   // instructions expect. The hardware produces -1, but these produce the
920   // bitwidth.
921   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
922     .scalarize(0)
923     .clampScalar(0, S32, S32)
924     .clampScalar(1, S32, S64)
925     .widenScalarToNextPow2(0, 32)
926     .widenScalarToNextPow2(1, 32)
927     .lower();
928 
929   // The 64-bit versions produce 32-bit results, but only on the SALU.
930   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
931     .legalFor({{S32, S32}, {S32, S64}})
932     .clampScalar(0, S32, S32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32)
936     .widenScalarToNextPow2(1, 32);
937 
938   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
939   // RegBankSelect.
940   getActionDefinitionsBuilder(G_BITREVERSE)
941     .legalFor({S32, S64})
942     .clampScalar(0, S32, S64)
943     .scalarize(0)
944     .widenScalarToNextPow2(0);
945 
946   if (ST.has16BitInsts()) {
947     getActionDefinitionsBuilder(G_BSWAP)
948       .legalFor({S16, S32, V2S16})
949       .clampMaxNumElements(0, S16, 2)
950       // FIXME: Fixing non-power-of-2 before clamp is workaround for
951       // narrowScalar limitation.
952       .widenScalarToNextPow2(0)
953       .clampScalar(0, S16, S32)
954       .scalarize(0);
955 
956     if (ST.hasVOP3PInsts()) {
957       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
958         .legalFor({S32, S16, V2S16})
959         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
960         .clampMaxNumElements(0, S16, 2)
961         .minScalar(0, S16)
962         .widenScalarToNextPow2(0)
963         .scalarize(0)
964         .lower();
965     } else {
966       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
967         .legalFor({S32, S16})
968         .widenScalarToNextPow2(0)
969         .minScalar(0, S16)
970         .scalarize(0)
971         .lower();
972     }
973   } else {
974     // TODO: Should have same legality without v_perm_b32
975     getActionDefinitionsBuilder(G_BSWAP)
976       .legalFor({S32})
977       .lowerIf(scalarNarrowerThan(0, 32))
978       // FIXME: Fixing non-power-of-2 before clamp is workaround for
979       // narrowScalar limitation.
980       .widenScalarToNextPow2(0)
981       .maxScalar(0, S32)
982       .scalarize(0)
983       .lower();
984 
985     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
986       .legalFor({S32})
987       .minScalar(0, S32)
988       .widenScalarToNextPow2(0)
989       .scalarize(0)
990       .lower();
991   }
992 
993   getActionDefinitionsBuilder(G_INTTOPTR)
994     // List the common cases
995     .legalForCartesianProduct(AddrSpaces64, {S64})
996     .legalForCartesianProduct(AddrSpaces32, {S32})
997     .scalarize(0)
998     // Accept any address space as long as the size matches
999     .legalIf(sameSize(0, 1))
1000     .widenScalarIf(smallerThan(1, 0),
1001       [](const LegalityQuery &Query) {
1002         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1003       })
1004     .narrowScalarIf(largerThan(1, 0),
1005       [](const LegalityQuery &Query) {
1006         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1007       });
1008 
1009   getActionDefinitionsBuilder(G_PTRTOINT)
1010     // List the common cases
1011     .legalForCartesianProduct(AddrSpaces64, {S64})
1012     .legalForCartesianProduct(AddrSpaces32, {S32})
1013     .scalarize(0)
1014     // Accept any address space as long as the size matches
1015     .legalIf(sameSize(0, 1))
1016     .widenScalarIf(smallerThan(0, 1),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1019       })
1020     .narrowScalarIf(
1021       largerThan(0, 1),
1022       [](const LegalityQuery &Query) {
1023         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1024       });
1025 
1026   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1027     .scalarize(0)
1028     .custom();
1029 
1030   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1031                                     bool IsLoad) -> bool {
1032     const LLT DstTy = Query.Types[0];
1033 
1034     // Split vector extloads.
1035     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1036     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1037 
1038     if (MemSize < DstTy.getSizeInBits())
1039       MemSize = std::max(MemSize, AlignBits);
1040 
1041     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1042       return true;
1043 
1044     const LLT PtrTy = Query.Types[1];
1045     unsigned AS = PtrTy.getAddressSpace();
1046     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1047       return true;
1048 
1049     // Catch weird sized loads that don't evenly divide into the access sizes
1050     // TODO: May be able to widen depending on alignment etc.
1051     unsigned NumRegs = (MemSize + 31) / 32;
1052     if (NumRegs == 3) {
1053       if (!ST.hasDwordx3LoadStores())
1054         return true;
1055     } else {
1056       // If the alignment allows, these should have been widened.
1057       if (!isPowerOf2_32(NumRegs))
1058         return true;
1059     }
1060 
1061     if (AlignBits < MemSize) {
1062       const SITargetLowering *TLI = ST.getTargetLowering();
1063       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1064                                                       Align(AlignBits / 8));
1065     }
1066 
1067     return false;
1068   };
1069 
1070   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1071   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1072   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1073 
1074   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1075   // LDS
1076   // TODO: Unsupported flat for SI.
1077 
1078   for (unsigned Op : {G_LOAD, G_STORE}) {
1079     const bool IsStore = Op == G_STORE;
1080 
1081     auto &Actions = getActionDefinitionsBuilder(Op);
1082     // Explicitly list some common cases.
1083     // TODO: Does this help compile time at all?
1084     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1085                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1086                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1087                                       {S64, GlobalPtr, 64, GlobalAlign32},
1088                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1089                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1090                                       {S32, GlobalPtr, 8, GlobalAlign8},
1091                                       {S32, GlobalPtr, 16, GlobalAlign16},
1092 
1093                                       {S32, LocalPtr, 32, 32},
1094                                       {S64, LocalPtr, 64, 32},
1095                                       {V2S32, LocalPtr, 64, 32},
1096                                       {S32, LocalPtr, 8, 8},
1097                                       {S32, LocalPtr, 16, 16},
1098                                       {V2S16, LocalPtr, 32, 32},
1099 
1100                                       {S32, PrivatePtr, 32, 32},
1101                                       {S32, PrivatePtr, 8, 8},
1102                                       {S32, PrivatePtr, 16, 16},
1103                                       {V2S16, PrivatePtr, 32, 32},
1104 
1105                                       {S32, ConstantPtr, 32, GlobalAlign32},
1106                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1107                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1108                                       {S64, ConstantPtr, 64, GlobalAlign32},
1109                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1110     Actions.legalIf(
1111       [=](const LegalityQuery &Query) -> bool {
1112         return isLoadStoreLegal(ST, Query, Op);
1113       });
1114 
1115     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1116     // 64-bits.
1117     //
1118     // TODO: Should generalize bitcast action into coerce, which will also cover
1119     // inserting addrspacecasts.
1120     Actions.customIf(typeIs(1, Constant32Ptr));
1121 
1122     // Turn any illegal element vectors into something easier to deal
1123     // with. These will ultimately produce 32-bit scalar shifts to extract the
1124     // parts anyway.
1125     //
1126     // For odd 16-bit element vectors, prefer to split those into pieces with
1127     // 16-bit vector parts.
1128     Actions.bitcastIf(
1129       [=](const LegalityQuery &Query) -> bool {
1130         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1131                                           Query.MMODescrs[0].SizeInBits);
1132       }, bitcastToRegisterType(0));
1133 
1134     if (!IsStore) {
1135       // Widen suitably aligned loads by loading extra bytes. The standard
1136       // legalization actions can't properly express widening memory operands.
1137       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1138         return shouldWidenLoad(ST, Query, G_LOAD);
1139       });
1140     }
1141 
1142     // FIXME: load/store narrowing should be moved to lower action
1143     Actions
1144         .narrowScalarIf(
1145             [=](const LegalityQuery &Query) -> bool {
1146               return !Query.Types[0].isVector() &&
1147                      needToSplitMemOp(Query, Op == G_LOAD);
1148             },
1149             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1150               const LLT DstTy = Query.Types[0];
1151               const LLT PtrTy = Query.Types[1];
1152 
1153               const unsigned DstSize = DstTy.getSizeInBits();
1154               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1155 
1156               // Split extloads.
1157               if (DstSize > MemSize)
1158                 return std::make_pair(0, LLT::scalar(MemSize));
1159 
1160               if (!isPowerOf2_32(DstSize)) {
1161                 // We're probably decomposing an odd sized store. Try to split
1162                 // to the widest type. TODO: Account for alignment. As-is it
1163                 // should be OK, since the new parts will be further legalized.
1164                 unsigned FloorSize = PowerOf2Floor(DstSize);
1165                 return std::make_pair(0, LLT::scalar(FloorSize));
1166               }
1167 
1168               if (DstSize > 32 && (DstSize % 32 != 0)) {
1169                 // FIXME: Need a way to specify non-extload of larger size if
1170                 // suitably aligned.
1171                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1172               }
1173 
1174               unsigned MaxSize = maxSizeForAddrSpace(ST,
1175                                                      PtrTy.getAddressSpace(),
1176                                                      Op == G_LOAD);
1177               if (MemSize > MaxSize)
1178                 return std::make_pair(0, LLT::scalar(MaxSize));
1179 
1180               unsigned Align = Query.MMODescrs[0].AlignInBits;
1181               return std::make_pair(0, LLT::scalar(Align));
1182             })
1183         .fewerElementsIf(
1184             [=](const LegalityQuery &Query) -> bool {
1185               return Query.Types[0].isVector() &&
1186                      needToSplitMemOp(Query, Op == G_LOAD);
1187             },
1188             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1189               const LLT DstTy = Query.Types[0];
1190               const LLT PtrTy = Query.Types[1];
1191 
1192               LLT EltTy = DstTy.getElementType();
1193               unsigned MaxSize = maxSizeForAddrSpace(ST,
1194                                                      PtrTy.getAddressSpace(),
1195                                                      Op == G_LOAD);
1196 
1197               // FIXME: Handle widened to power of 2 results better. This ends
1198               // up scalarizing.
1199               // FIXME: 3 element stores scalarized on SI
1200 
1201               // Split if it's too large for the address space.
1202               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1203                 unsigned NumElts = DstTy.getNumElements();
1204                 unsigned EltSize = EltTy.getSizeInBits();
1205 
1206                 if (MaxSize % EltSize == 0) {
1207                   return std::make_pair(
1208                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1209                 }
1210 
1211                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1212 
1213                 // FIXME: Refine when odd breakdowns handled
1214                 // The scalars will need to be re-legalized.
1215                 if (NumPieces == 1 || NumPieces >= NumElts ||
1216                     NumElts % NumPieces != 0)
1217                   return std::make_pair(0, EltTy);
1218 
1219                 return std::make_pair(0,
1220                                       LLT::vector(NumElts / NumPieces, EltTy));
1221               }
1222 
1223               // FIXME: We could probably handle weird extending loads better.
1224               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1225               if (DstTy.getSizeInBits() > MemSize)
1226                 return std::make_pair(0, EltTy);
1227 
1228               unsigned EltSize = EltTy.getSizeInBits();
1229               unsigned DstSize = DstTy.getSizeInBits();
1230               if (!isPowerOf2_32(DstSize)) {
1231                 // We're probably decomposing an odd sized store. Try to split
1232                 // to the widest type. TODO: Account for alignment. As-is it
1233                 // should be OK, since the new parts will be further legalized.
1234                 unsigned FloorSize = PowerOf2Floor(DstSize);
1235                 return std::make_pair(
1236                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1237               }
1238 
1239               // Need to split because of alignment.
1240               unsigned Align = Query.MMODescrs[0].AlignInBits;
1241               if (EltSize > Align &&
1242                   (EltSize / Align < DstTy.getNumElements())) {
1243                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1244               }
1245 
1246               // May need relegalization for the scalars.
1247               return std::make_pair(0, EltTy);
1248             })
1249     .lowerIfMemSizeNotPow2()
1250     .minScalar(0, S32);
1251 
1252     if (IsStore)
1253       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1254 
1255     Actions
1256         .widenScalarToNextPow2(0)
1257         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1258         .lower();
1259   }
1260 
1261   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1262                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1263                                                   {S32, GlobalPtr, 16, 2 * 8},
1264                                                   {S32, LocalPtr, 8, 8},
1265                                                   {S32, LocalPtr, 16, 16},
1266                                                   {S32, PrivatePtr, 8, 8},
1267                                                   {S32, PrivatePtr, 16, 16},
1268                                                   {S32, ConstantPtr, 8, 8},
1269                                                   {S32, ConstantPtr, 16, 2 * 8}});
1270   if (ST.hasFlatAddressSpace()) {
1271     ExtLoads.legalForTypesWithMemDesc(
1272         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1273   }
1274 
1275   ExtLoads.clampScalar(0, S32, S32)
1276           .widenScalarToNextPow2(0)
1277           .unsupportedIfMemSizeNotPow2()
1278           .lower();
1279 
1280   auto &Atomics = getActionDefinitionsBuilder(
1281     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1282      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1283      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1284      G_ATOMICRMW_UMIN})
1285     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1286                {S64, GlobalPtr}, {S64, LocalPtr},
1287                {S32, RegionPtr}, {S64, RegionPtr}});
1288   if (ST.hasFlatAddressSpace()) {
1289     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1290   }
1291 
1292   if (ST.hasLDSFPAtomics()) {
1293     auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1294       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1295     if (ST.hasGFX90AInsts())
1296       Atomic.legalFor({{S64, LocalPtr}});
1297   }
1298 
1299   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1300   // demarshalling
1301   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1302     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1303                 {S32, FlatPtr}, {S64, FlatPtr}})
1304     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1305                {S32, RegionPtr}, {S64, RegionPtr}});
1306   // TODO: Pointer types, any 32-bit or 64-bit vector
1307 
1308   // Condition should be s32 for scalar, s1 for vector.
1309   getActionDefinitionsBuilder(G_SELECT)
1310     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1311           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1312           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1313     .clampScalar(0, S16, S64)
1314     .scalarize(1)
1315     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1316     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1317     .clampMaxNumElements(0, S32, 2)
1318     .clampMaxNumElements(0, LocalPtr, 2)
1319     .clampMaxNumElements(0, PrivatePtr, 2)
1320     .scalarize(0)
1321     .widenScalarToNextPow2(0)
1322     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1323 
1324   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1325   // be more flexible with the shift amount type.
1326   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1327     .legalFor({{S32, S32}, {S64, S32}});
1328   if (ST.has16BitInsts()) {
1329     if (ST.hasVOP3PInsts()) {
1330       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1331             .clampMaxNumElements(0, S16, 2);
1332     } else
1333       Shifts.legalFor({{S16, S16}});
1334 
1335     // TODO: Support 16-bit shift amounts for all types
1336     Shifts.widenScalarIf(
1337       [=](const LegalityQuery &Query) {
1338         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1339         // 32-bit amount.
1340         const LLT ValTy = Query.Types[0];
1341         const LLT AmountTy = Query.Types[1];
1342         return ValTy.getSizeInBits() <= 16 &&
1343                AmountTy.getSizeInBits() < 16;
1344       }, changeTo(1, S16));
1345     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1346     Shifts.clampScalar(1, S32, S32);
1347     Shifts.clampScalar(0, S16, S64);
1348     Shifts.widenScalarToNextPow2(0, 16);
1349 
1350     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1351       .minScalar(0, S16)
1352       .scalarize(0)
1353       .lower();
1354   } else {
1355     // Make sure we legalize the shift amount type first, as the general
1356     // expansion for the shifted type will produce much worse code if it hasn't
1357     // been truncated already.
1358     Shifts.clampScalar(1, S32, S32);
1359     Shifts.clampScalar(0, S32, S64);
1360     Shifts.widenScalarToNextPow2(0, 32);
1361 
1362     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1363       .minScalar(0, S32)
1364       .scalarize(0)
1365       .lower();
1366   }
1367   Shifts.scalarize(0);
1368 
1369   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1370     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1371     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1372     unsigned IdxTypeIdx = 2;
1373 
1374     getActionDefinitionsBuilder(Op)
1375       .customIf([=](const LegalityQuery &Query) {
1376           const LLT EltTy = Query.Types[EltTypeIdx];
1377           const LLT VecTy = Query.Types[VecTypeIdx];
1378           const LLT IdxTy = Query.Types[IdxTypeIdx];
1379           const unsigned EltSize = EltTy.getSizeInBits();
1380           return (EltSize == 32 || EltSize == 64) &&
1381                   VecTy.getSizeInBits() % 32 == 0 &&
1382                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1383                   IdxTy.getSizeInBits() == 32;
1384         })
1385       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1386                  bitcastToVectorElement32(VecTypeIdx))
1387       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1388       .bitcastIf(
1389         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1390         [=](const LegalityQuery &Query) {
1391           // For > 64-bit element types, try to turn this into a 64-bit
1392           // element vector since we may be able to do better indexing
1393           // if this is scalar. If not, fall back to 32.
1394           const LLT EltTy = Query.Types[EltTypeIdx];
1395           const LLT VecTy = Query.Types[VecTypeIdx];
1396           const unsigned DstEltSize = EltTy.getSizeInBits();
1397           const unsigned VecSize = VecTy.getSizeInBits();
1398 
1399           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1400           return std::make_pair(
1401             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1402         })
1403       .clampScalar(EltTypeIdx, S32, S64)
1404       .clampScalar(VecTypeIdx, S32, S64)
1405       .clampScalar(IdxTypeIdx, S32, S32)
1406       .clampMaxNumElements(VecTypeIdx, S32, 32)
1407       // TODO: Clamp elements for 64-bit vectors?
1408       // It should only be necessary with variable indexes.
1409       // As a last resort, lower to the stack
1410       .lower();
1411   }
1412 
1413   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1414     .unsupportedIf([=](const LegalityQuery &Query) {
1415         const LLT &EltTy = Query.Types[1].getElementType();
1416         return Query.Types[0] != EltTy;
1417       });
1418 
1419   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1420     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1421     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1422 
1423     // FIXME: Doesn't handle extract of illegal sizes.
1424     getActionDefinitionsBuilder(Op)
1425       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1426       // FIXME: Multiples of 16 should not be legal.
1427       .legalIf([=](const LegalityQuery &Query) {
1428           const LLT BigTy = Query.Types[BigTyIdx];
1429           const LLT LitTy = Query.Types[LitTyIdx];
1430           return (BigTy.getSizeInBits() % 32 == 0) &&
1431                  (LitTy.getSizeInBits() % 16 == 0);
1432         })
1433       .widenScalarIf(
1434         [=](const LegalityQuery &Query) {
1435           const LLT BigTy = Query.Types[BigTyIdx];
1436           return (BigTy.getScalarSizeInBits() < 16);
1437         },
1438         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1439       .widenScalarIf(
1440         [=](const LegalityQuery &Query) {
1441           const LLT LitTy = Query.Types[LitTyIdx];
1442           return (LitTy.getScalarSizeInBits() < 16);
1443         },
1444         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1445       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }
1449 
1450   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1451     .legalForCartesianProduct(AllS32Vectors, {S32})
1452     .legalForCartesianProduct(AllS64Vectors, {S64})
1453     .clampNumElements(0, V16S32, V32S32)
1454     .clampNumElements(0, V2S64, V16S64)
1455     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1456 
1457   if (ST.hasScalarPackInsts()) {
1458     BuildVector
1459       // FIXME: Should probably widen s1 vectors straight to s32
1460       .minScalarOrElt(0, S16)
1461       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1462       .minScalar(1, S32);
1463 
1464     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1465       .legalFor({V2S16, S32})
1466       .lower();
1467     BuildVector.minScalarOrElt(0, S32);
1468   } else {
1469     BuildVector.customFor({V2S16, S16});
1470     BuildVector.minScalarOrElt(0, S32);
1471 
1472     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1473       .customFor({V2S16, S32})
1474       .lower();
1475   }
1476 
1477   BuildVector.legalIf(isRegisterType(0));
1478 
1479   // FIXME: Clamp maximum size
1480   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1481     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1482     .clampMaxNumElements(0, S32, 32)
1483     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1484     .clampMaxNumElements(0, S16, 64);
1485 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1488   if (ST.hasVOP3PInsts()) {
1489     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1490       .customFor({V2S16, V2S16})
1491       .lower();
1492   } else
1493     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1494 
1495   // Merge/Unmerge
1496   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1497     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1498     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1499 
1500     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1501       const LLT Ty = Query.Types[TypeIdx];
1502       if (Ty.isVector()) {
1503         const LLT &EltTy = Ty.getElementType();
1504         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1505           return true;
1506         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1507           return true;
1508       }
1509       return false;
1510     };
1511 
1512     auto &Builder = getActionDefinitionsBuilder(Op)
1513       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1514       .lowerFor({{S16, V2S16}})
1515       .lowerIf([=](const LegalityQuery &Query) {
1516           const LLT BigTy = Query.Types[BigTyIdx];
1517           return BigTy.getSizeInBits() == 32;
1518         })
1519       // Try to widen to s16 first for small types.
1520       // TODO: Only do this on targets with legal s16 shifts
1521       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1522       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1523       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1524       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1525                            elementTypeIs(1, S16)),
1526                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1530       .clampScalar(LitTyIdx, S32, S512)
1531       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1532       // Break up vectors with weird elements into scalars
1533       .fewerElementsIf(
1534         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1535         scalarize(0))
1536       .fewerElementsIf(
1537         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1538         scalarize(1))
1539       .clampScalar(BigTyIdx, S32, MaxScalar);
1540 
1541     if (Op == G_MERGE_VALUES) {
1542       Builder.widenScalarIf(
1543         // TODO: Use 16-bit shifts if legal for 8-bit values?
1544         [=](const LegalityQuery &Query) {
1545           const LLT Ty = Query.Types[LitTyIdx];
1546           return Ty.getSizeInBits() < 32;
1547         },
1548         changeTo(LitTyIdx, S32));
1549     }
1550 
1551     Builder.widenScalarIf(
1552       [=](const LegalityQuery &Query) {
1553         const LLT Ty = Query.Types[BigTyIdx];
1554         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1555           Ty.getSizeInBits() % 16 != 0;
1556       },
1557       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1560         const LLT &Ty = Query.Types[BigTyIdx];
1561         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1562         if (NewSizeInBits >= 256) {
1563           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1564           if (RoundedTo < NewSizeInBits)
1565             NewSizeInBits = RoundedTo;
1566         }
1567         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1568       })
1569       // Any vectors left are the wrong size. Scalarize them.
1570       .scalarize(0)
1571       .scalarize(1);
1572   }
1573 
1574   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1575   // RegBankSelect.
1576   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1577     .legalFor({{S32}, {S64}});
1578 
1579   if (ST.hasVOP3PInsts()) {
1580     SextInReg.lowerFor({{V2S16}})
1581       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1582       // get more vector shift opportunities, since we'll get those when
1583       // expanded.
1584       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1585   } else if (ST.has16BitInsts()) {
1586     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1587   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1590     SextInReg.lowerFor({{S32}, {S64}});
1591   }
1592 
1593   SextInReg
1594     .scalarize(0)
1595     .clampScalar(0, S32, S64)
1596     .lower();
1597 
1598   getActionDefinitionsBuilder(G_FSHR)
1599     .legalFor({{S32, S32}})
1600     .scalarize(0)
1601     .lower();
1602 
1603   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1604     .legalFor({S64});
1605 
1606   getActionDefinitionsBuilder(G_FENCE)
1607     .alwaysLegal();
1608 
1609   getActionDefinitionsBuilder({
1610       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1611       G_FCOPYSIGN,
1612 
1613       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1614       G_ATOMICRMW_NAND,
1615       G_ATOMICRMW_FSUB,
1616       G_READ_REGISTER,
1617       G_WRITE_REGISTER,
1618 
1619       G_SADDO, G_SSUBO,
1620 
      // TODO: Implement
1622       G_FMINIMUM, G_FMAXIMUM,
1623       G_FSHL
1624     }).lower();
1625 
1626   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1627         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1628         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1629     .unsupported();
1630 
1631   computeTables();
1632   verify(*ST.getInstrInfo());
1633 }
1634 
1635 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1636                                          MachineInstr &MI) const {
1637   MachineIRBuilder &B = Helper.MIRBuilder;
1638   MachineRegisterInfo &MRI = *B.getMRI();
1639 
1640   switch (MI.getOpcode()) {
1641   case TargetOpcode::G_ADDRSPACE_CAST:
1642     return legalizeAddrSpaceCast(MI, MRI, B);
1643   case TargetOpcode::G_FRINT:
1644     return legalizeFrint(MI, MRI, B);
1645   case TargetOpcode::G_FCEIL:
1646     return legalizeFceil(MI, MRI, B);
1647   case TargetOpcode::G_FREM:
1648     return legalizeFrem(MI, MRI, B);
1649   case TargetOpcode::G_INTRINSIC_TRUNC:
1650     return legalizeIntrinsicTrunc(MI, MRI, B);
1651   case TargetOpcode::G_SITOFP:
1652     return legalizeITOFP(MI, MRI, B, true);
1653   case TargetOpcode::G_UITOFP:
1654     return legalizeITOFP(MI, MRI, B, false);
1655   case TargetOpcode::G_FPTOSI:
1656     return legalizeFPTOI(MI, MRI, B, true);
1657   case TargetOpcode::G_FPTOUI:
1658     return legalizeFPTOI(MI, MRI, B, false);
1659   case TargetOpcode::G_FMINNUM:
1660   case TargetOpcode::G_FMAXNUM:
1661   case TargetOpcode::G_FMINNUM_IEEE:
1662   case TargetOpcode::G_FMAXNUM_IEEE:
1663     return legalizeMinNumMaxNum(Helper, MI);
1664   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1665     return legalizeExtractVectorElt(MI, MRI, B);
1666   case TargetOpcode::G_INSERT_VECTOR_ELT:
1667     return legalizeInsertVectorElt(MI, MRI, B);
1668   case TargetOpcode::G_SHUFFLE_VECTOR:
1669     return legalizeShuffleVector(MI, MRI, B);
1670   case TargetOpcode::G_FSIN:
1671   case TargetOpcode::G_FCOS:
1672     return legalizeSinCos(MI, MRI, B);
1673   case TargetOpcode::G_GLOBAL_VALUE:
1674     return legalizeGlobalValue(MI, MRI, B);
1675   case TargetOpcode::G_LOAD:
1676     return legalizeLoad(Helper, MI);
1677   case TargetOpcode::G_FMAD:
1678     return legalizeFMad(MI, MRI, B);
1679   case TargetOpcode::G_FDIV:
1680     return legalizeFDIV(MI, MRI, B);
1681   case TargetOpcode::G_UDIV:
1682   case TargetOpcode::G_UREM:
1683     return legalizeUDIV_UREM(MI, MRI, B);
1684   case TargetOpcode::G_SDIV:
1685   case TargetOpcode::G_SREM:
1686     return legalizeSDIV_SREM(MI, MRI, B);
1687   case TargetOpcode::G_ATOMIC_CMPXCHG:
1688     return legalizeAtomicCmpXChg(MI, MRI, B);
1689   case TargetOpcode::G_FLOG:
1690     return legalizeFlog(MI, B, numbers::ln2f);
1691   case TargetOpcode::G_FLOG10:
1692     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1693   case TargetOpcode::G_FEXP:
1694     return legalizeFExp(MI, B);
1695   case TargetOpcode::G_FPOW:
1696     return legalizeFPow(MI, B);
1697   case TargetOpcode::G_FFLOOR:
1698     return legalizeFFloor(MI, MRI, B);
1699   case TargetOpcode::G_BUILD_VECTOR:
1700     return legalizeBuildVector(MI, MRI, B);
1701   default:
1702     return false;
1703   }
1704 
1705   llvm_unreachable("expected switch to return");
1706 }
1707 
1708 Register AMDGPULegalizerInfo::getSegmentAperture(
1709   unsigned AS,
1710   MachineRegisterInfo &MRI,
1711   MachineIRBuilder &B) const {
1712   MachineFunction &MF = B.getMF();
1713   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1714   const LLT S32 = LLT::scalar(32);
1715 
1716   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1717 
1718   if (ST.hasApertureRegs()) {
1719     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1720     // getreg.
1721     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1722         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1723         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1724     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1725         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1726         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1727     unsigned Encoding =
1728         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1729         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1730         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1731 
1732     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1733 
1734     B.buildInstr(AMDGPU::S_GETREG_B32)
1735       .addDef(GetReg)
1736       .addImm(Encoding);
1737     MRI.setType(GetReg, S32);
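    // S_GETREG returns the 16-bit aperture base field in the low bits of the
    // result; shifting it left by the field width (WidthM1 + 1) yields the
    // aperture value used as the high 32 bits of flat addresses in this
    // segment.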
1738 
1739     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1740     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1741   }
1742 
1743   Register QueuePtr = MRI.createGenericVirtualRegister(
1744     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1745 
1746   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1747     return Register();
1748 
1749   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1750   // private_segment_aperture_base_hi.
1751   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1752 
1753   // TODO: can we be smarter about machine pointer info?
1754   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1755   MachineMemOperand *MMO = MF.getMachineMemOperand(
1756       PtrInfo,
1757       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1758           MachineMemOperand::MOInvariant,
1759       4, commonAlignment(Align(64), StructOffset));
1760 
1761   Register LoadAddr;
1762 
1763   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1764   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1765 }
1766 
1767 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1768   MachineInstr &MI, MachineRegisterInfo &MRI,
1769   MachineIRBuilder &B) const {
1770   MachineFunction &MF = B.getMF();
1771 
1772   const LLT S32 = LLT::scalar(32);
1773   Register Dst = MI.getOperand(0).getReg();
1774   Register Src = MI.getOperand(1).getReg();
1775 
1776   LLT DstTy = MRI.getType(Dst);
1777   LLT SrcTy = MRI.getType(Src);
1778   unsigned DestAS = DstTy.getAddressSpace();
1779   unsigned SrcAS = SrcTy.getAddressSpace();
1780 
1781   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1782   // vector element.
1783   assert(!DstTy.isVector());
1784 
1785   const AMDGPUTargetMachine &TM
1786     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1787 
1788   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1789     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1790     return true;
1791   }
1792 
1793   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1794     // Truncate.
1795     B.buildExtract(Dst, Src, 0);
1796     MI.eraseFromParent();
1797     return true;
1798   }
1799 
1800   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1801     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1802     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1803 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to make
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1807     auto HighAddr = B.buildConstant(
1808       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1809     B.buildMerge(Dst, {Src, HighAddr});
1810     MI.eraseFromParent();
1811     return true;
1812   }
1813 
1814   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1815     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1816            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1817     unsigned NullVal = TM.getNullPointerValue(DestAS);
1818 
1819     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1820     auto FlatNull = B.buildConstant(SrcTy, 0);
1821 
1822     // Extract low 32-bits of the pointer.
1823     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1824 
1825     auto CmpRes =
1826         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1827     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1828 
1829     MI.eraseFromParent();
1830     return true;
1831   }
1832 
1833   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1834     return false;
1835 
1836   if (!ST.hasFlatAddressSpace())
1837     return false;
1838 
1839   auto SegmentNull =
1840       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1841   auto FlatNull =
1842       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1843 
1844   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1845   if (!ApertureReg.isValid())
1846     return false;
1847 
1848   auto CmpRes =
1849       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1850 
1851   // Coerce the type of the low half of the result so we can use merge_values.
1852   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1853 
1854   // TODO: Should we allow mismatched types but matching sizes in merges to
1855   // avoid the ptrtoint?
1856   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
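  // The 64-bit flat pointer is assembled from the segment offset in the low
  // half and the aperture base in the high half; a null segment pointer is
  // translated to the flat null value by the select.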
1857   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1858 
1859   MI.eraseFromParent();
1860   return true;
1861 }
1862 
1863 bool AMDGPULegalizerInfo::legalizeFrint(
1864   MachineInstr &MI, MachineRegisterInfo &MRI,
1865   MachineIRBuilder &B) const {
1866   Register Src = MI.getOperand(1).getReg();
1867   LLT Ty = MRI.getType(Src);
1868   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1869 
1870   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1871   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
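  // C1 is 2^52, the magnitude at which double precision can no longer hold a
  // fractional part, and C2 is the largest value below it. Adding and then
  // subtracting copysign(C1, x) rounds x to an integer, and inputs with
  // |x| > C2 are already integral, so the select below returns them unchanged.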
1872 
1873   auto C1 = B.buildFConstant(Ty, C1Val);
1874   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1875 
1876   // TODO: Should this propagate fast-math-flags?
1877   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1878   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1879 
1880   auto C2 = B.buildFConstant(Ty, C2Val);
1881   auto Fabs = B.buildFAbs(Ty, Src);
1882 
1883   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1884   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1885   MI.eraseFromParent();
1886   return true;
1887 }
1888 
1889 bool AMDGPULegalizerInfo::legalizeFceil(
1890   MachineInstr &MI, MachineRegisterInfo &MRI,
1891   MachineIRBuilder &B) const {
1892 
1893   const LLT S1 = LLT::scalar(1);
1894   const LLT S64 = LLT::scalar(64);
1895 
1896   Register Src = MI.getOperand(1).getReg();
1897   assert(MRI.getType(Src) == S64);
1898 
1899   // result = trunc(src)
1900   // if (src > 0.0 && src != result)
1901   //   result += 1.0
1902 
1903   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1904 
1905   const auto Zero = B.buildFConstant(S64, 0.0);
1906   const auto One = B.buildFConstant(S64, 1.0);
1907   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1908   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1909   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1910   auto Add = B.buildSelect(S64, And, One, Zero);
1911 
1912   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1915 }
1916 
1917 bool AMDGPULegalizerInfo::legalizeFrem(
1918   MachineInstr &MI, MachineRegisterInfo &MRI,
1919   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1932 }
1933 
1934 static MachineInstrBuilder extractF64Exponent(Register Hi,
1935                                               MachineIRBuilder &B) {
1936   const unsigned FractBits = 52;
1937   const unsigned ExpBits = 11;
1938   LLT S32 = LLT::scalar(32);
1939 
1940   auto Const0 = B.buildConstant(S32, FractBits - 32);
1941   auto Const1 = B.buildConstant(S32, ExpBits);
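  // The exponent occupies bits 52-62 of the f64, i.e. bits 20-30 of the high
  // dword, so extract an 11-bit field at offset FractBits - 32 and remove the
  // IEEE-754 bias of 1023 below.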
1942 
1943   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1944     .addUse(Hi)
1945     .addUse(Const0.getReg(0))
1946     .addUse(Const1.getReg(0));
1947 
1948   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1949 }
1950 
1951 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1952   MachineInstr &MI, MachineRegisterInfo &MRI,
1953   MachineIRBuilder &B) const {
1954   const LLT S1 = LLT::scalar(1);
1955   const LLT S32 = LLT::scalar(32);
1956   const LLT S64 = LLT::scalar(64);
1957 
1958   Register Src = MI.getOperand(1).getReg();
1959   assert(MRI.getType(Src) == S64);
1960 
1961   // TODO: Should this use extract since the low half is unused?
1962   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1963   Register Hi = Unmerge.getReg(1);
1964 
1965   // Extract the upper half, since this is where we will find the sign and
1966   // exponent.
1967   auto Exp = extractF64Exponent(Hi, B);
1968 
1969   const unsigned FractBits = 52;
1970 
1971   // Extract the sign bit.
1972   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1973   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1974 
1975   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1976 
1977   const auto Zero32 = B.buildConstant(S32, 0);
1978 
1979   // Extend back to 64-bits.
1980   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1981 
1982   auto Shr = B.buildAShr(S64, FractMask, Exp);
1983   auto Not = B.buildNot(S64, Shr);
1984   auto Tmp0 = B.buildAnd(S64, Src, Not);
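  // Tmp0 is the source with its fractional bits cleared: the 52-bit fraction
  // mask shifted right by the unbiased exponent covers exactly the bits below
  // the binary point for exponents in [0, 51]. The selects below handle the
  // out-of-range cases: exp < 0 yields a signed zero, exp > 51 means the
  // input is already integral.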
1985   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1986 
1987   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1988   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1989 
1990   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1991   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1992   MI.eraseFromParent();
1993   return true;
1994 }
1995 
1996 bool AMDGPULegalizerInfo::legalizeITOFP(
1997   MachineInstr &MI, MachineRegisterInfo &MRI,
1998   MachineIRBuilder &B, bool Signed) const {
1999 
2000   Register Dst = MI.getOperand(0).getReg();
2001   Register Src = MI.getOperand(1).getReg();
2002 
2003   const LLT S64 = LLT::scalar(64);
2004   const LLT S32 = LLT::scalar(32);
2005 
2006   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2007 
2008   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2009 
2010   auto CvtHi = Signed ?
2011     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2012     B.buildUITOFP(S64, Unmerge.getReg(1));
2013 
2014   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2015 
2016   auto ThirtyTwo = B.buildConstant(S32, 32);
2017   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2018     .addUse(CvtHi.getReg(0))
2019     .addUse(ThirtyTwo.getReg(0));
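  // Convert the two halves separately: ldexp scales the converted high word
  // by 2^32, and the unsigned conversion of the low word is added in below to
  // reassemble the full 64-bit value as a double.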
2020 
2021   // TODO: Should this propagate fast-math-flags?
2022   B.buildFAdd(Dst, LdExp, CvtLo);
2023   MI.eraseFromParent();
2024   return true;
2025 }
2026 
2027 // TODO: Copied from DAG implementation. Verify logic and document how this
2028 // actually works.
2029 bool AMDGPULegalizerInfo::legalizeFPTOI(
2030   MachineInstr &MI, MachineRegisterInfo &MRI,
2031   MachineIRBuilder &B, bool Signed) const {
2032 
2033   Register Dst = MI.getOperand(0).getReg();
2034   Register Src = MI.getOperand(1).getReg();
2035 
2036   const LLT S64 = LLT::scalar(64);
2037   const LLT S32 = LLT::scalar(32);
2038 
2039   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2040 
2041   unsigned Flags = MI.getFlags();
2042 
2043   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2044   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2045   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
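  // K0 is 2^-32 and K1 is -(2^32): the high 32 result bits are
  // floor(trunc(x) * 2^-32), and the low 32 bits are recovered with
  // fma(hi, -(2^32), trunc(x)); each half is then converted with a 32-bit
  // fptoui/fptosi.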
2046 
2047   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2048   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2049   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2050 
2051   auto Hi = Signed ?
2052     B.buildFPTOSI(S32, FloorMul) :
2053     B.buildFPTOUI(S32, FloorMul);
2054   auto Lo = B.buildFPTOUI(S32, Fma);
2055 
2056   B.buildMerge(Dst, { Lo, Hi });
2057   MI.eraseFromParent();
2058 
2059   return true;
2060 }
2061 
2062 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2063                                                MachineInstr &MI) const {
2064   MachineFunction &MF = Helper.MIRBuilder.getMF();
2065   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2066 
2067   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2068                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2069 
2070   // With ieee_mode disabled, the instructions have the correct behavior
2071   // already for G_FMINNUM/G_FMAXNUM
2072   if (!MFI->getMode().IEEE)
2073     return !IsIEEEOp;
2074 
2075   if (IsIEEEOp)
2076     return true;
2077 
2078   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2079 }
2080 
2081 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2082   MachineInstr &MI, MachineRegisterInfo &MRI,
2083   MachineIRBuilder &B) const {
2084   // TODO: Should move some of this into LegalizerHelper.
2085 
2086   // TODO: Promote dynamic indexing of s16 to s32
2087 
2088   // FIXME: Artifact combiner probably should have replaced the truncated
2089   // constant before this, so we shouldn't need
2090   // getConstantVRegValWithLookThrough.
2091   Optional<ValueAndVReg> MaybeIdxVal =
2092       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2093   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2094     return true;
2095   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2096 
2097   Register Dst = MI.getOperand(0).getReg();
2098   Register Vec = MI.getOperand(1).getReg();
2099 
2100   LLT VecTy = MRI.getType(Vec);
2101   LLT EltTy = VecTy.getElementType();
2102   assert(EltTy == MRI.getType(Dst));
2103 
2104   if (IdxVal < VecTy.getNumElements())
2105     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2106   else
2107     B.buildUndef(Dst);
2108 
2109   MI.eraseFromParent();
2110   return true;
2111 }
2112 
2113 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2114   MachineInstr &MI, MachineRegisterInfo &MRI,
2115   MachineIRBuilder &B) const {
2116   // TODO: Should move some of this into LegalizerHelper.
2117 
2118   // TODO: Promote dynamic indexing of s16 to s32
2119 
2120   // FIXME: Artifact combiner probably should have replaced the truncated
2121   // constant before this, so we shouldn't need
2122   // getConstantVRegValWithLookThrough.
2123   Optional<ValueAndVReg> MaybeIdxVal =
2124       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2125   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2126     return true;
2127 
2128   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2129   Register Dst = MI.getOperand(0).getReg();
2130   Register Vec = MI.getOperand(1).getReg();
2131   Register Ins = MI.getOperand(2).getReg();
2132 
2133   LLT VecTy = MRI.getType(Vec);
2134   LLT EltTy = VecTy.getElementType();
2135   assert(EltTy == MRI.getType(Ins));
2136 
2137   if (IdxVal < VecTy.getNumElements())
2138     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2139   else
2140     B.buildUndef(Dst);
2141 
2142   MI.eraseFromParent();
2143   return true;
2144 }
2145 
2146 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2147   MachineInstr &MI, MachineRegisterInfo &MRI,
2148   MachineIRBuilder &B) const {
2149   const LLT V2S16 = LLT::vector(2, 16);
2150 
2151   Register Dst = MI.getOperand(0).getReg();
2152   Register Src0 = MI.getOperand(1).getReg();
2153   LLT DstTy = MRI.getType(Dst);
2154   LLT SrcTy = MRI.getType(Src0);
2155 
2156   if (SrcTy == V2S16 && DstTy == V2S16 &&
2157       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2158     return true;
2159 
2160   MachineIRBuilder HelperBuilder(MI);
2161   GISelObserverWrapper DummyObserver;
2162   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2163   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2164 }
2165 
2166 bool AMDGPULegalizerInfo::legalizeSinCos(
2167   MachineInstr &MI, MachineRegisterInfo &MRI,
2168   MachineIRBuilder &B) const {
2169 
2170   Register DstReg = MI.getOperand(0).getReg();
2171   Register SrcReg = MI.getOperand(1).getReg();
2172   LLT Ty = MRI.getType(DstReg);
2173   unsigned Flags = MI.getFlags();
2174 
2175   Register TrigVal;
2176   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
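  // The hardware sin/cos instructions take their input in units of 2*pi,
  // hence the scale by 1/(2*pi). Subtargets with a reduced input range
  // additionally need an explicit fract to wrap the operand into [0, 1).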
2177   if (ST.hasTrigReducedRange()) {
2178     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2179     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2180       .addUse(MulVal.getReg(0))
2181       .setMIFlags(Flags).getReg(0);
2182   } else
2183     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2184 
2185   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2186     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2187   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2188     .addUse(TrigVal)
2189     .setMIFlags(Flags);
2190   MI.eraseFromParent();
2191   return true;
2192 }
2193 
2194 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2195                                                   MachineIRBuilder &B,
2196                                                   const GlobalValue *GV,
2197                                                   int64_t Offset,
2198                                                   unsigned GAFlags) const {
2199   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2200   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2201   // to the following code sequence:
2202   //
2203   // For constant address space:
2204   //   s_getpc_b64 s[0:1]
2205   //   s_add_u32 s0, s0, $symbol
2206   //   s_addc_u32 s1, s1, 0
2207   //
2208   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2209   //   a fixup or relocation is emitted to replace $symbol with a literal
2210   //   constant, which is a pc-relative offset from the encoding of the $symbol
2211   //   operand to the global variable.
2212   //
2213   // For global address space:
2214   //   s_getpc_b64 s[0:1]
2215   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2216   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2217   //
2218   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2219   //   fixups or relocations are emitted to replace $symbol@*@lo and
2220   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2221   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2222   //   operand to the global variable.
2223   //
2224   // What we want here is an offset from the value returned by s_getpc
2225   // (which is the address of the s_add_u32 instruction) to the global
2226   // variable, but since the encoding of $symbol starts 4 bytes after the start
2227   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2228   // small. This requires us to add 4 to the global variable offset in order to
2229   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2230   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2231   // instruction.
2232 
2233   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2234 
2235   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2236     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2237 
2238   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2239     .addDef(PCReg);
2240 
2241   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2242   if (GAFlags == SIInstrInfo::MO_NONE)
2243     MIB.addImm(0);
2244   else
2245     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2246 
2247   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2248 
2249   if (PtrTy.getSizeInBits() == 32)
2250     B.buildExtract(DstReg, PCReg, 0);
2251   return true;
2252  }
2253 
2254 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2255   MachineInstr &MI, MachineRegisterInfo &MRI,
2256   MachineIRBuilder &B) const {
2257   Register DstReg = MI.getOperand(0).getReg();
2258   LLT Ty = MRI.getType(DstReg);
2259   unsigned AS = Ty.getAddressSpace();
2260 
2261   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2262   MachineFunction &MF = B.getMF();
2263   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2264 
2265   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2266     if (!MFI->isModuleEntryFunction()) {
2267       const Function &Fn = MF.getFunction();
2268       DiagnosticInfoUnsupported BadLDSDecl(
2269         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2270         DS_Warning);
2271       Fn.getContext().diagnose(BadLDSDecl);
2272 
2273       // We currently don't have a way to correctly allocate LDS objects that
2274       // aren't directly associated with a kernel. We do force inlining of
2275       // functions that use local objects. However, if these dead functions are
2276       // not eliminated, we don't want a compile time error. Just emit a warning
2277       // and a trap, since there should be no callable path here.
2278       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2279       B.buildUndef(DstReg);
2280       MI.eraseFromParent();
2281       return true;
2282     }
2283 
2284     // TODO: We could emit code to handle the initialization somewhere.
2285     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2286       const SITargetLowering *TLI = ST.getTargetLowering();
2287       if (!TLI->shouldUseLDSConstAddress(GV)) {
2288         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2290       }
2291 
2292       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2293         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. These arrays are allocated
        // by the runtime and placed directly after the statically allocated
        // ones, so they all share the same offset.
2299         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2300           // Adjust alignment for that dynamic shared memory array.
2301           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2302           LLT S32 = LLT::scalar(32);
2303           auto Sz =
2304               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2305           B.buildIntToPtr(DstReg, Sz);
2306           MI.eraseFromParent();
2307           return true;
2308         }
2309       }
2310 
2311       B.buildConstant(
2312           DstReg,
2313           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2314       MI.eraseFromParent();
2315       return true;
2316     }
2317 
2318     const Function &Fn = MF.getFunction();
2319     DiagnosticInfoUnsupported BadInit(
2320       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2321     Fn.getContext().diagnose(BadInit);
2322     return true;
2323   }
2324 
2325   const SITargetLowering *TLI = ST.getTargetLowering();
2326 
2327   if (TLI->shouldEmitFixup(GV)) {
2328     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2329     MI.eraseFromParent();
2330     return true;
2331   }
2332 
2333   if (TLI->shouldEmitPCReloc(GV)) {
2334     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2335     MI.eraseFromParent();
2336     return true;
2337   }
2338 
2339   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2340   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2341 
2342   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2343       MachinePointerInfo::getGOT(MF),
2344       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2345           MachineMemOperand::MOInvariant,
2346       8 /*Size*/, Align(8));
2347 
2348   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2349 
2350   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2352     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2353     B.buildExtract(DstReg, Load, 0);
2354   } else
2355     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2356 
2357   MI.eraseFromParent();
2358   return true;
2359 }
2360 
2361 static LLT widenToNextPowerOf2(LLT Ty) {
2362   if (Ty.isVector())
2363     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2364   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2365 }
2366 
2367 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2368                                        MachineInstr &MI) const {
2369   MachineIRBuilder &B = Helper.MIRBuilder;
2370   MachineRegisterInfo &MRI = *B.getMRI();
2371   GISelChangeObserver &Observer = Helper.Observer;
2372 
2373   Register PtrReg = MI.getOperand(1).getReg();
2374   LLT PtrTy = MRI.getType(PtrReg);
2375   unsigned AddrSpace = PtrTy.getAddressSpace();
2376 
2377   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2378     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2379     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2380     Observer.changingInstr(MI);
2381     MI.getOperand(1).setReg(Cast.getReg(0));
2382     Observer.changedInstr(MI);
2383     return true;
2384   }
2385 
2386   Register ValReg = MI.getOperand(0).getReg();
2387   LLT ValTy = MRI.getType(ValReg);
2388 
2389   MachineMemOperand *MMO = *MI.memoperands_begin();
2390   const unsigned ValSize = ValTy.getSizeInBits();
2391   const unsigned MemSize = 8 * MMO->getSize();
2392   const Align MemAlign = MMO->getAlign();
2393   const unsigned AlignInBits = 8 * MemAlign.value();
2394 
2395   // Widen non-power-of-2 loads to the alignment if needed
2396   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2397     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2398 
2399     // This was already the correct extending load result type, so just adjust
2400     // the memory type.
2401     if (WideMemSize == ValSize) {
2402       MachineFunction &MF = B.getMF();
2403 
2404       MachineMemOperand *WideMMO =
2405           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2406       Observer.changingInstr(MI);
2407       MI.setMemRefs(MF, {WideMMO});
2408       Observer.changedInstr(MI);
2409       return true;
2410     }
2411 
2412     // Don't bother handling edge case that should probably never be produced.
2413     if (ValSize > WideMemSize)
2414       return false;
2415 
2416     LLT WideTy = widenToNextPowerOf2(ValTy);
2417 
2418     Register WideLoad;
2419     if (!WideTy.isVector()) {
2420       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2421       B.buildTrunc(ValReg, WideLoad).getReg(0);
2422     } else {
2423       // Extract the subvector.
2424 
2425       if (isRegisterType(ValTy)) {
2426         // If this a case where G_EXTRACT is legal, use it.
2427         // (e.g. <3 x s32> -> <4 x s32>)
2428         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2429         B.buildExtract(ValReg, WideLoad, 0);
2430       } else {
2431         // For cases where the widened type isn't a nice register value, unmerge
2432         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2433         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2434         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2435         B.setInsertPt(B.getMBB(), MI.getIterator());
2436         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2437       }
2438     }
2439 
2440     MI.eraseFromParent();
2441     return true;
2442   }
2443 
2444   return false;
2445 }
2446 
2447 bool AMDGPULegalizerInfo::legalizeFMad(
2448   MachineInstr &MI, MachineRegisterInfo &MRI,
2449   MachineIRBuilder &B) const {
2450   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2451   assert(Ty.isScalar());
2452 
2453   MachineFunction &MF = B.getMF();
2454   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2455 
2456   // TODO: Always legal with future ftz flag.
2457   // FIXME: Do we need just output?
2458   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2459     return true;
2460   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2461     return true;
2462 
2463   MachineIRBuilder HelperBuilder(MI);
2464   GISelObserverWrapper DummyObserver;
2465   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2466   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2467 }
2468 
2469 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2470   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2471   Register DstReg = MI.getOperand(0).getReg();
2472   Register PtrReg = MI.getOperand(1).getReg();
2473   Register CmpVal = MI.getOperand(2).getReg();
2474   Register NewVal = MI.getOperand(3).getReg();
2475 
2476   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2477          "this should not have been custom lowered");
2478 
2479   LLT ValTy = MRI.getType(CmpVal);
2480   LLT VecTy = LLT::vector(2, ValTy);
2481 
2482   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2483 
2484   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2485     .addDef(DstReg)
2486     .addUse(PtrReg)
2487     .addUse(PackedVal)
2488     .setMemRefs(MI.memoperands());
2489 
2490   MI.eraseFromParent();
2491   return true;
2492 }
2493 
2494 bool AMDGPULegalizerInfo::legalizeFlog(
2495   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2496   Register Dst = MI.getOperand(0).getReg();
2497   Register Src = MI.getOperand(1).getReg();
2498   LLT Ty = B.getMRI()->getType(Dst);
2499   unsigned Flags = MI.getFlags();
2500 
2501   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2502   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2503 
2504   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2505   MI.eraseFromParent();
2506   return true;
2507 }
2508 
2509 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2510                                        MachineIRBuilder &B) const {
2511   Register Dst = MI.getOperand(0).getReg();
2512   Register Src = MI.getOperand(1).getReg();
2513   unsigned Flags = MI.getFlags();
2514   LLT Ty = B.getMRI()->getType(Dst);
2515 
2516   auto K = B.buildFConstant(Ty, numbers::log2e);
2517   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2518   B.buildFExp2(Dst, Mul, Flags);
2519   MI.eraseFromParent();
2520   return true;
2521 }
2522 
2523 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2524                                        MachineIRBuilder &B) const {
2525   Register Dst = MI.getOperand(0).getReg();
2526   Register Src0 = MI.getOperand(1).getReg();
2527   Register Src1 = MI.getOperand(2).getReg();
2528   unsigned Flags = MI.getFlags();
2529   LLT Ty = B.getMRI()->getType(Dst);
2530   const LLT S16 = LLT::scalar(16);
2531   const LLT S32 = LLT::scalar(32);
2532 
2533   if (Ty == S32) {
2534     auto Log = B.buildFLog2(S32, Src0, Flags);
2535     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2536       .addUse(Log.getReg(0))
2537       .addUse(Src1)
2538       .setMIFlags(Flags);
2539     B.buildFExp2(Dst, Mul, Flags);
2540   } else if (Ty == S16) {
2541     // There's no f16 fmul_legacy, so we need to convert for it.
2542     auto Log = B.buildFLog2(S16, Src0, Flags);
2543     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2544     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2545     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2546       .addUse(Ext0.getReg(0))
2547       .addUse(Ext1.getReg(0))
2548       .setMIFlags(Flags);
2549 
2550     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2551   } else
2552     return false;
2553 
2554   MI.eraseFromParent();
2555   return true;
2556 }
2557 
2558 // Find a source register, ignoring any possible source modifiers.
2559 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2560   Register ModSrc = OrigSrc;
2561   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2562     ModSrc = SrcFNeg->getOperand(1).getReg();
2563     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2564       ModSrc = SrcFAbs->getOperand(1).getReg();
2565   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2566     ModSrc = SrcFAbs->getOperand(1).getReg();
2567   return ModSrc;
2568 }
2569 
2570 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2571                                          MachineRegisterInfo &MRI,
2572                                          MachineIRBuilder &B) const {
2573 
2574   const LLT S1 = LLT::scalar(1);
2575   const LLT S64 = LLT::scalar(64);
2576   Register Dst = MI.getOperand(0).getReg();
2577   Register OrigSrc = MI.getOperand(1).getReg();
2578   unsigned Flags = MI.getFlags();
2579   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2580          "this should not have been custom lowered");
2581 
2582   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2583   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2584   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2585   // V_FRACT bug is:
2586   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2587   //
2588   // Convert floor(x) to (x - fract(x))
2589 
2590   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2591     .addUse(OrigSrc)
2592     .setMIFlags(Flags);
2593 
2594   // Give source modifier matching some assistance before obscuring a foldable
2595   // pattern.
2596 
2597   // TODO: We can avoid the neg on the fract? The input sign to fract
2598   // shouldn't matter?
2599   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2600 
2601   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2602 
2603   Register Min = MRI.createGenericVirtualRegister(S64);
2604 
2605   // We don't need to concern ourselves with the snan handling difference, so
2606   // use the one which will directly select.
2607   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2608   if (MFI->getMode().IEEE)
2609     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2610   else
2611     B.buildFMinNum(Min, Fract, Const, Flags);
2612 
2613   Register CorrectedFract = Min;
2614   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2616     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2617   }
2618 
2619   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2620   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2621 
2622   MI.eraseFromParent();
2623   return true;
2624 }
2625 
2626 // Turn an illegal packed v2s16 build vector into bit operations.
2627 // TODO: This should probably be a bitcast action in LegalizerHelper.
2628 bool AMDGPULegalizerInfo::legalizeBuildVector(
2629   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2630   Register Dst = MI.getOperand(0).getReg();
2631   const LLT S32 = LLT::scalar(32);
2632   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2633 
2634   Register Src0 = MI.getOperand(1).getReg();
2635   Register Src1 = MI.getOperand(2).getReg();
2636   assert(MRI.getType(Src0) == LLT::scalar(16));
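  // Pack the two s16 sources by merging them into an s32 and reinterpreting
  // the result as v2s16; this turns into plain shift/or bit operations.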
2637 
2638   auto Merge = B.buildMerge(S32, {Src0, Src1});
2639   B.buildBitcast(Dst, Merge);
2640 
2641   MI.eraseFromParent();
2642   return true;
2643 }
2644 
2645 // Check that this is a G_XOR x, -1
2646 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2647   if (MI.getOpcode() != TargetOpcode::G_XOR)
2648     return false;
2649   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2650   return ConstVal && *ConstVal == -1;
2651 }
2652 
2653 // Return the use branch instruction, otherwise null if the usage is invalid.
2654 static MachineInstr *
2655 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2656                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2657   Register CondDef = MI.getOperand(0).getReg();
2658   if (!MRI.hasOneNonDBGUse(CondDef))
2659     return nullptr;
2660 
2661   MachineBasicBlock *Parent = MI.getParent();
2662   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2663 
2664   if (isNot(MRI, *UseMI)) {
2665     Register NegatedCond = UseMI->getOperand(0).getReg();
2666     if (!MRI.hasOneNonDBGUse(NegatedCond))
2667       return nullptr;
2668 
2669     // We're deleting the def of this value, so we need to remove it.
2670     UseMI->eraseFromParent();
2671 
2672     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2673     Negated = true;
2674   }
2675 
2676   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2677     return nullptr;
2678 
2679   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2680   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2681   if (Next == Parent->end()) {
2682     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2683     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2684       return nullptr;
2685     UncondBrTarget = &*NextMBB;
2686   } else {
2687     if (Next->getOpcode() != AMDGPU::G_BR)
2688       return nullptr;
2689     Br = &*Next;
2690     UncondBrTarget = Br->getOperand(0).getMBB();
2691   }
2692 
2693   return UseMI;
2694 }
2695 
2696 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2697                                          const ArgDescriptor *Arg,
2698                                          const TargetRegisterClass *ArgRC,
2699                                          LLT ArgTy) const {
2700   MCRegister SrcReg = Arg->getRegister();
2701   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2702   assert(DstReg.isVirtual() && "Virtual register expected");
2703 
  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
                                             *ArgRC, ArgTy);
2706   if (Arg->isMasked()) {
2707     // TODO: Should we try to emit this once in the entry block?
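    // A masked argument occupies a bitfield within the incoming physical
    // register (such as the packed work-item IDs), so shift the field down
    // and mask off the other bits.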
2708     const LLT S32 = LLT::scalar(32);
2709     const unsigned Mask = Arg->getMask();
2710     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2711 
2712     Register AndMaskSrc = LiveIn;
2713 
2714     if (Shift != 0) {
2715       auto ShiftAmt = B.buildConstant(S32, Shift);
2716       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2717     }
2718 
2719     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2720   } else {
2721     B.buildCopy(DstReg, LiveIn);
2722   }
2723 
2724   return true;
2725 }
2726 
2727 bool AMDGPULegalizerInfo::loadInputValue(
2728     Register DstReg, MachineIRBuilder &B,
2729     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2730   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2731   const ArgDescriptor *Arg;
2732   const TargetRegisterClass *ArgRC;
2733   LLT ArgTy;
2734   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2735 
2736   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2737     return false; // TODO: Handle these
2738   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2739 }
2740 
2741 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2742     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2743     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2744   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2745     return false;
2746 
2747   MI.eraseFromParent();
2748   return true;
2749 }
2750 
2751 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2752                                        MachineRegisterInfo &MRI,
2753                                        MachineIRBuilder &B) const {
2754   Register Dst = MI.getOperand(0).getReg();
2755   LLT DstTy = MRI.getType(Dst);
2756   LLT S16 = LLT::scalar(16);
2757   LLT S32 = LLT::scalar(32);
2758   LLT S64 = LLT::scalar(64);
2759 
2760   if (DstTy == S16)
2761     return legalizeFDIV16(MI, MRI, B);
2762   if (DstTy == S32)
2763     return legalizeFDIV32(MI, MRI, B);
2764   if (DstTy == S64)
2765     return legalizeFDIV64(MI, MRI, B);
2766 
2767   return false;
2768 }
2769 
2770 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2771                                                   Register DstReg,
2772                                                   Register X,
2773                                                   Register Y,
2774                                                   bool IsDiv) const {
2775   const LLT S1 = LLT::scalar(1);
2776   const LLT S32 = LLT::scalar(32);
2777 
2778   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2779   // algorithm used here.
2780 
2781   // Initial estimate of inv(y).
2782   auto FloatY = B.buildUITOFP(S32, Y);
2783   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2784   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2785   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2786   auto Z = B.buildFPTOUI(S32, ScaledY);
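  // Z is a 32-bit fixed-point estimate of 2^32 / Y. The scale constant
  // 0x4f7ffffe is a float just below 2^32, which biases the estimate slightly
  // low.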
2787 
2788   // One round of UNR.
2789   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2790   auto NegYZ = B.buildMul(S32, NegY, Z);
2791   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
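  // One Newton-Raphson step: Z += umulh(Z, (0 - Y) * Z) refines the
  // reciprocal estimate; the residual error is corrected by the two
  // conditional quotient/remainder adjustments below.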
2792 
2793   // Quotient/remainder estimate.
2794   auto Q = B.buildUMulH(S32, X, Z);
2795   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2796 
2797   // First quotient/remainder refinement.
2798   auto One = B.buildConstant(S32, 1);
2799   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2800   if (IsDiv)
2801     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2802   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2803 
2804   // Second quotient/remainder refinement.
2805   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2806   if (IsDiv)
2807     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2808   else
2809     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2810 }
2811 
2812 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2813                                               MachineRegisterInfo &MRI,
2814                                               MachineIRBuilder &B) const {
2815   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2816   Register DstReg = MI.getOperand(0).getReg();
2817   Register Num = MI.getOperand(1).getReg();
2818   Register Den = MI.getOperand(2).getReg();
2819   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2820   MI.eraseFromParent();
2821   return true;
2822 }
2823 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2825 //
2826 // Return lo, hi of result
2827 //
2828 // %cvt.lo = G_UITOFP Val.lo
2829 // %cvt.hi = G_UITOFP Val.hi
2830 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2831 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2832 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2833 // %mul2 = G_FMUL %mul1, 2**(-32)
2834 // %trunc = G_INTRINSIC_TRUNC %mul2
2835 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2836 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2837 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2838                                                        Register Val) {
2839   const LLT S32 = LLT::scalar(32);
2840   auto Unmerge = B.buildUnmerge(S32, Val);
2841 
2842   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2843   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2844 
2845   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2846                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2847 
2848   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2849   auto Mul1 =
2850       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
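  // 0x5f7ffffc is a float just below 2^64, so Mul1 approximates 2^64 / Val in
  // fixed point; the steps below split it back into 32-bit halves.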
2851 
2852   // 2**(-32)
2853   auto Mul2 =
2854       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2855   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2856 
2857   // -(2**32)
2858   auto Mad2 = B.buildFMAD(S32, Trunc,
2859                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2860 
2861   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2862   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2863 
2864   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2865 }
2866 
2867 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2868                                                   Register DstReg,
2869                                                   Register Numer,
2870                                                   Register Denom,
2871                                                   bool IsDiv) const {
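  // Refine the ~2**64/Denom fixed-point reciprocal from emitReciprocalU64 with
  // two rounds of multiword Newton-Raphson (Add1, Add2), take
  // MulHi3 = umulh(Numer, Add2) as the quotient estimate and
  // Sub1 = Numer - Denom * MulHi3 as the remainder estimate, then apply up to
  // two conditional corrections via the C3 and C6 selects below.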
2872   const LLT S32 = LLT::scalar(32);
2873   const LLT S64 = LLT::scalar(64);
2874   const LLT S1 = LLT::scalar(1);
2875   Register RcpLo, RcpHi;
2876 
2877   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2878 
2879   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2880 
2881   auto Zero64 = B.buildConstant(S64, 0);
2882   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2883 
2884   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2885   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2886 
2887   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2888   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2889   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2890 
2891   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2892   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2893   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2894   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2895 
2896   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2897   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2898   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2899   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2900   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2901 
2902   auto Zero32 = B.buildConstant(S32, 0);
2903   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2904   auto Add2_HiC =
2905       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2906   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2907   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2908 
2909   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2910   Register NumerLo = UnmergeNumer.getReg(0);
2911   Register NumerHi = UnmergeNumer.getReg(1);
2912 
2913   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2914   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2915   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2916   Register Mul3_Lo = UnmergeMul3.getReg(0);
2917   Register Mul3_Hi = UnmergeMul3.getReg(1);
2918   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2919   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2920   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2921   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2922 
2923   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2924   Register DenomLo = UnmergeDenom.getReg(0);
2925   Register DenomHi = UnmergeDenom.getReg(1);
2926 
2927   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2928   auto C1 = B.buildSExt(S32, CmpHi);
2929 
2930   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2931   auto C2 = B.buildSExt(S32, CmpLo);
2932 
2933   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2934   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2935 
2936   // TODO: Here and below, portions of the code could be enclosed in if/endif
2937   // blocks. Currently the control flow is unconditional and we have 4 selects
2938   // after the potential endif to substitute for PHIs.
2939 
2940   // if C3 != 0 ...
2941   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2942   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2943   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2944   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2945 
2946   auto One64 = B.buildConstant(S64, 1);
2947   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2948 
2949   auto C4 =
2950       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2951   auto C5 =
2952       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2953   auto C6 = B.buildSelect(
2954       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2955 
2956   // if (C6 != 0)
2957   auto Add4 = B.buildAdd(S64, Add3, One64);
2958   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2959 
2960   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2961   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2962   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2963 
2964   // endif C6
2965   // endif C3
2966 
2967   if (IsDiv) {
2968     auto Sel1 = B.buildSelect(
2969         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2970     B.buildSelect(DstReg,
2971                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2972   } else {
2973     auto Sel2 = B.buildSelect(
2974         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2975     B.buildSelect(DstReg,
2976                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2977   }
2978 }
2979 
2980 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2981                                             MachineRegisterInfo &MRI,
2982                                             MachineIRBuilder &B) const {
2983   const LLT S64 = LLT::scalar(64);
2984   const LLT S32 = LLT::scalar(32);
2985   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2986   Register DstReg = MI.getOperand(0).getReg();
2987   Register Num = MI.getOperand(1).getReg();
2988   Register Den = MI.getOperand(2).getReg();
2989   LLT Ty = MRI.getType(DstReg);
2990 
2991   if (Ty == S32)
2992     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2993   else if (Ty == S64)
2994     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2995   else
2996     return false;
2997 
2998   MI.eraseFromParent();
2999   return true;
3001 }
3002 
3003 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3004                                             MachineRegisterInfo &MRI,
3005                                             MachineIRBuilder &B) const {
3006   const LLT S64 = LLT::scalar(64);
3007   const LLT S32 = LLT::scalar(32);
3008 
3009   Register DstReg = MI.getOperand(0).getReg();
3010   const LLT Ty = MRI.getType(DstReg);
3011   if (Ty != S32 && Ty != S64)
3012     return false;
3013 
3014   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3015 
3016   Register LHS = MI.getOperand(1).getReg();
3017   Register RHS = MI.getOperand(2).getReg();
3018 
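  // Reduce to the unsigned expansion: take absolute values via
  // abs(x) = (x + sign) ^ sign with sign = x >> (bits - 1), run the unsigned
  // div/rem, then restore the sign with (res ^ Sign) - Sign. The quotient sign
  // is sign(LHS) ^ sign(RHS); the remainder sign follows LHS.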
3019   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3020   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3021   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3022 
3023   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3024   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3025 
3026   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3027   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3028 
3029   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3030   if (Ty == S32)
3031     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3032   else
3033     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3034 
3035   Register Sign;
3036   if (IsDiv)
3037     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3038   else
3039     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3040 
3041   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3042   B.buildSub(DstReg, UDivRem, Sign);
3043 
3044   MI.eraseFromParent();
3045   return true;
3046 }
3047 
3048 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3049                                                  MachineRegisterInfo &MRI,
3050                                                  MachineIRBuilder &B) const {
3051   Register Res = MI.getOperand(0).getReg();
3052   Register LHS = MI.getOperand(1).getReg();
3053   Register RHS = MI.getOperand(2).getReg();
3054   uint16_t Flags = MI.getFlags();
3055   LLT ResTy = MRI.getType(Res);
3056 
3057   const MachineFunction &MF = B.getMF();
3058   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3059                             MI.getFlag(MachineInstr::FmAfn);
3060 
3061   if (!AllowInaccurateRcp)
3062     return false;
3063 
3064   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3065     // 1 / x -> RCP(x)
3066     if (CLHS->isExactlyValue(1.0)) {
3067       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3068         .addUse(RHS)
3069         .setMIFlags(Flags);
3070 
3071       MI.eraseFromParent();
3072       return true;
3073     }
3074 
3075     // -1 / x -> RCP( FNEG(x) )
3076     if (CLHS->isExactlyValue(-1.0)) {
3077       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3078       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3079         .addUse(FNeg.getReg(0))
3080         .setMIFlags(Flags);
3081 
3082       MI.eraseFromParent();
3083       return true;
3084     }
3085   }
3086 
3087   // x / y -> x * (1.0 / y)
3088   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3089     .addUse(RHS)
3090     .setMIFlags(Flags);
3091   B.buildFMul(Res, LHS, RCP, Flags);
3092 
3093   MI.eraseFromParent();
3094   return true;
3095 }
3096 
3097 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3098                                                    MachineRegisterInfo &MRI,
3099                                                    MachineIRBuilder &B) const {
3100   Register Res = MI.getOperand(0).getReg();
3101   Register X = MI.getOperand(1).getReg();
3102   Register Y = MI.getOperand(2).getReg();
3103   uint16_t Flags = MI.getFlags();
3104   LLT ResTy = MRI.getType(Res);
3105 
3106   const MachineFunction &MF = B.getMF();
3107   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3108                             MI.getFlag(MachineInstr::FmAfn);
3109 
3110   if (!AllowInaccurateRcp)
3111     return false;
3112 
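  // Refine rcp(y) with two Newton-Raphson iterations, r = r + r * (1 - y * r),
  // multiply by x, then apply one final residual correction:
  // res = ret + r * (x - y * ret).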
3113   auto NegY = B.buildFNeg(ResTy, Y);
3114   auto One = B.buildFConstant(ResTy, 1.0);
3115 
3116   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3117     .addUse(Y)
3118     .setMIFlags(Flags);
3119 
3120   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3121   R = B.buildFMA(ResTy, Tmp0, R, R);
3122 
3123   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3124   R = B.buildFMA(ResTy, Tmp1, R, R);
3125 
3126   auto Ret = B.buildFMul(ResTy, X, R);
3127   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3128 
3129   B.buildFMA(Res, Tmp2, R, Ret);
3130   MI.eraseFromParent();
3131   return true;
3132 }
3133 
3134 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3135                                          MachineRegisterInfo &MRI,
3136                                          MachineIRBuilder &B) const {
3137   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3138     return true;
3139 
3140   Register Res = MI.getOperand(0).getReg();
3141   Register LHS = MI.getOperand(1).getReg();
3142   Register RHS = MI.getOperand(2).getReg();
3143 
3144   uint16_t Flags = MI.getFlags();
3145 
3146   LLT S16 = LLT::scalar(16);
3147   LLT S32 = LLT::scalar(32);
3148 
3149   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3150   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3151 
3152   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3153     .addUse(RHSExt.getReg(0))
3154     .setMIFlags(Flags);
3155 
3156   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3157   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3158 
3159   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3160     .addUse(RDst.getReg(0))
3161     .addUse(RHS)
3162     .addUse(LHS)
3163     .setMIFlags(Flags);
3164 
3165   MI.eraseFromParent();
3166   return true;
3167 }
3168 
3169 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3170 // that enable denormals; when false, restore the function's default FP32 mode.
3171 static void toggleSPDenormMode(bool Enable,
3172                                MachineIRBuilder &B,
3173                                const GCNSubtarget &ST,
3174                                AMDGPU::SIModeRegisterDefaults Mode) {
3175   // Set SP denorm mode to this value.
3176   unsigned SPDenormMode =
3177     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3178 
3179   if (ST.hasDenormModeInst()) {
3180     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3181     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3182 
3183     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3184     B.buildInstr(AMDGPU::S_DENORM_MODE)
3185       .addImm(NewDenormModeValue);
3186 
3187   } else {
3188     // Select FP32 bit field in mode register.
3189     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3190                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3191                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
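    // That is, a 2-bit field at offset 4 of the MODE register, which holds the
    // FP32 denorm controls.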
3192 
3193     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3194       .addImm(SPDenormMode)
3195       .addImm(SPDenormModeBitField);
3196   }
3197 }
3198 
3199 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3200                                          MachineRegisterInfo &MRI,
3201                                          MachineIRBuilder &B) const {
3202   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3203     return true;
3204 
3205   Register Res = MI.getOperand(0).getReg();
3206   Register LHS = MI.getOperand(1).getReg();
3207   Register RHS = MI.getOperand(2).getReg();
3208   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3209   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3210 
3211   uint16_t Flags = MI.getFlags();
3212 
3213   LLT S32 = LLT::scalar(32);
3214   LLT S1 = LLT::scalar(1);
3215 
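  // Full f32 division expansion: scale the operands with div_scale, refine an
  // rcp estimate of the scaled denominator through the FMA chain below (run
  // with FP32 denormals temporarily enabled), then combine the pieces with
  // div_fmas and apply the final correction with div_fixup.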
3216   auto One = B.buildFConstant(S32, 1.0f);
3217 
3218   auto DenominatorScaled =
3219     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3220       .addUse(LHS)
3221       .addUse(RHS)
3222       .addImm(0)
3223       .setMIFlags(Flags);
3224   auto NumeratorScaled =
3225     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3226       .addUse(LHS)
3227       .addUse(RHS)
3228       .addImm(1)
3229       .setMIFlags(Flags);
3230 
3231   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3232     .addUse(DenominatorScaled.getReg(0))
3233     .setMIFlags(Flags);
3234   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3235 
3236   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3237   // aren't modeled as reading it.
3238   if (!Mode.allFP32Denormals())
3239     toggleSPDenormMode(true, B, ST, Mode);
3240 
3241   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3242   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3243   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3244   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3245   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3246   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3247 
3248   if (!Mode.allFP32Denormals())
3249     toggleSPDenormMode(false, B, ST, Mode);
3250 
3251   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3252     .addUse(Fma4.getReg(0))
3253     .addUse(Fma1.getReg(0))
3254     .addUse(Fma3.getReg(0))
3255     .addUse(NumeratorScaled.getReg(1))
3256     .setMIFlags(Flags);
3257 
3258   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3259     .addUse(Fmas.getReg(0))
3260     .addUse(RHS)
3261     .addUse(LHS)
3262     .setMIFlags(Flags);
3263 
3264   MI.eraseFromParent();
3265   return true;
3266 }
3267 
3268 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3269                                          MachineRegisterInfo &MRI,
3270                                          MachineIRBuilder &B) const {
3271   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3272     return true;
3273 
3274   Register Res = MI.getOperand(0).getReg();
3275   Register LHS = MI.getOperand(1).getReg();
3276   Register RHS = MI.getOperand(2).getReg();
3277 
3278   uint16_t Flags = MI.getFlags();
3279 
3280   LLT S64 = LLT::scalar(64);
3281   LLT S1 = LLT::scalar(1);
3282 
3283   auto One = B.buildFConstant(S64, 1.0);
3284 
3285   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3286     .addUse(LHS)
3287     .addUse(RHS)
3288     .addImm(0)
3289     .setMIFlags(Flags);
3290 
3291   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3292 
3293   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3294     .addUse(DivScale0.getReg(0))
3295     .setMIFlags(Flags);
3296 
3297   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3298   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3299   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3300 
3301   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3302     .addUse(LHS)
3303     .addUse(RHS)
3304     .addImm(1)
3305     .setMIFlags(Flags);
3306 
3307   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3308   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3309   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3310 
3311   Register Scale;
3312   if (!ST.hasUsableDivScaleConditionOutput()) {
3313     // Work around a hardware bug on SI where the condition output from div_scale
3314     // is not usable.
3315 
3316     LLT S32 = LLT::scalar(32);
3317 
3318     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3319     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3320     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3321     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3322 
3323     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3324                               Scale1Unmerge.getReg(1));
3325     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3326                               Scale0Unmerge.getReg(1));
3327     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3328   } else {
3329     Scale = DivScale1.getReg(1);
3330   }
3331 
3332   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3333     .addUse(Fma4.getReg(0))
3334     .addUse(Fma3.getReg(0))
3335     .addUse(Mul.getReg(0))
3336     .addUse(Scale)
3337     .setMIFlags(Flags);
3338 
3339   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3340     .addUse(Fmas.getReg(0))
3341     .addUse(RHS)
3342     .addUse(LHS)
3343     .setMIFlags(Flags);
3344 
3345   MI.eraseFromParent();
3346   return true;
3347 }
3348 
3349 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3350                                                  MachineRegisterInfo &MRI,
3351                                                  MachineIRBuilder &B) const {
3352   Register Res = MI.getOperand(0).getReg();
3353   Register LHS = MI.getOperand(2).getReg();
3354   Register RHS = MI.getOperand(3).getReg();
3355   uint16_t Flags = MI.getFlags();
3356 
3357   LLT S32 = LLT::scalar(32);
3358   LLT S1 = LLT::scalar(1);
3359 
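  // If |RHS| is very large (> 0x6f800000 = 2**96), pre-scale it by
  // 0x2f800000 = 2**(-32) before taking rcp so the reciprocal does not flush
  // to zero, then multiply the result by the same scale factor to compensate.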
3360   auto Abs = B.buildFAbs(S32, RHS, Flags);
3361   const APFloat C0Val(1.0f);
3362 
3363   auto C0 = B.buildConstant(S32, 0x6f800000);
3364   auto C1 = B.buildConstant(S32, 0x2f800000);
3365   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3366 
3367   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3368   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3369 
3370   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3371 
3372   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3373     .addUse(Mul0.getReg(0))
3374     .setMIFlags(Flags);
3375 
3376   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3377 
3378   B.buildFMul(Res, Sel, Mul1, Flags);
3379 
3380   MI.eraseFromParent();
3381   return true;
3382 }
3383 
3384 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3385 // FIXME: Why do we handle this one but not other removed instructions?
3386 //
3387 // Reciprocal square root.  The clamp prevents infinite results, clamping
3388 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3389 // +-max_float.
3390 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3391                                                     MachineRegisterInfo &MRI,
3392                                                     MachineIRBuilder &B) const {
3393   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3394     return true;
3395 
3396   Register Dst = MI.getOperand(0).getReg();
3397   Register Src = MI.getOperand(2).getReg();
3398   auto Flags = MI.getFlags();
3399 
3400   LLT Ty = MRI.getType(Dst);
3401 
3402   const fltSemantics *FltSemantics;
3403   if (Ty == LLT::scalar(32))
3404     FltSemantics = &APFloat::IEEEsingle();
3405   else if (Ty == LLT::scalar(64))
3406     FltSemantics = &APFloat::IEEEdouble();
3407   else
3408     return false;
3409 
3410   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3411     .addUse(Src)
3412     .setMIFlags(Flags);
3413 
3414   // We don't need to concern ourselves with the snan handling difference, since
3415   // the rsq has already quieted (or not), so use the variant that selects directly.
3416   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3417   const bool UseIEEE = MFI->getMode().IEEE;
3418 
3419   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3420   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3421                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3422 
3423   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3424 
3425   if (UseIEEE)
3426     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3427   else
3428     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3429   MI.eraseFromParent();
3430   return true;
3431 }
3432 
3433 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3434   switch (IID) {
3435   case Intrinsic::amdgcn_ds_fadd:
3436     return AMDGPU::G_ATOMICRMW_FADD;
3437   case Intrinsic::amdgcn_ds_fmin:
3438     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3439   case Intrinsic::amdgcn_ds_fmax:
3440     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3441   default:
3442     llvm_unreachable("not a DS FP intrinsic");
3443   }
3444 }
3445 
3446 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3447                                                       MachineInstr &MI,
3448                                                       Intrinsic::ID IID) const {
3449   GISelChangeObserver &Observer = Helper.Observer;
3450   Observer.changingInstr(MI);
3451 
3452   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3453 
3454   // The remaining operands were used to set fields in the MemOperand on
3455   // construction.
3456   for (int I = 6; I > 3; --I)
3457     MI.RemoveOperand(I);
3458 
3459   MI.RemoveOperand(1); // Remove the intrinsic ID.
3460   Observer.changedInstr(MI);
3461   return true;
3462 }
3463 
3464 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3465                                             MachineRegisterInfo &MRI,
3466                                             MachineIRBuilder &B) const {
3467   uint64_t Offset =
3468     ST.getTargetLowering()->getImplicitParameterOffset(
3469       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3470   LLT DstTy = MRI.getType(DstReg);
3471   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3472 
3473   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3474   if (!loadInputValue(KernargPtrReg, B,
3475                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3476     return false;
3477 
3478   // FIXME: This should be nuw
3479   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3480   return true;
3481 }
3482 
3483 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3484                                                  MachineRegisterInfo &MRI,
3485                                                  MachineIRBuilder &B) const {
3486   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3487   if (!MFI->isEntryFunction()) {
3488     return legalizePreloadedArgIntrin(MI, MRI, B,
3489                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3490   }
3491 
3492   Register DstReg = MI.getOperand(0).getReg();
3493   if (!getImplicitArgPtr(DstReg, MRI, B))
3494     return false;
3495 
3496   MI.eraseFromParent();
3497   return true;
3498 }
3499 
3500 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3501                                               MachineRegisterInfo &MRI,
3502                                               MachineIRBuilder &B,
3503                                               unsigned AddrSpace) const {
3504   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3505   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3506   Register Hi32 = Unmerge.getReg(1);
3507 
3508   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3509   MI.eraseFromParent();
3510   return true;
3511 }
3512 
3513 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3514 // offset (the offset that is included in bounds checking and swizzling, to be
3515 // split between the instruction's voffset and immoffset fields) and soffset
3516 // (the offset that is excluded from bounds checking and swizzling, to go in
3517 // the instruction's soffset field).  This function takes the first kind of
3518 // offset and figures out how to split it between voffset and immoffset.
3519 std::tuple<Register, unsigned, unsigned>
3520 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3521                                         Register OrigOffset) const {
3522   const unsigned MaxImm = 4095;
3523   Register BaseReg;
3524   unsigned TotalConstOffset;
3525   const LLT S32 = LLT::scalar(32);
3526   MachineRegisterInfo &MRI = *B.getMRI();
3527 
3528   std::tie(BaseReg, TotalConstOffset) =
3529       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3530 
3531   unsigned ImmOffset = TotalConstOffset;
3532 
3533   // If BaseReg is a pointer, convert it to int.
3534   if (MRI.getType(BaseReg).isPointer())
3535     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3536 
3537   // If the immediate value is too big for the immoffset field, keep only the
3538   // low 12 bits in the immoffset field so that the value copied/added into the
3539   // voffset field is a multiple of 4096, and it stands more chance of being
3540   // CSEd with the copy/add for another similar load/store.
3541   // However, do not round down to a multiple of 4096 if that multiple is a
3542   // negative number, as it appears to be illegal to have a negative offset
3543   // in the vgpr, even if adding the immediate offset makes it positive.
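  // For example, with MaxImm = 4095 a constant offset of 8196 is split into an
  // immediate offset of 4, with the remaining 8192 folded into the base register.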
3544   unsigned Overflow = ImmOffset & ~MaxImm;
3545   ImmOffset -= Overflow;
3546   if ((int32_t)Overflow < 0) {
3547     Overflow += ImmOffset;
3548     ImmOffset = 0;
3549   }
3550 
3551   if (Overflow != 0) {
3552     if (!BaseReg) {
3553       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3554     } else {
3555       auto OverflowVal = B.buildConstant(S32, Overflow);
3556       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3557     }
3558   }
3559 
3560   if (!BaseReg)
3561     BaseReg = B.buildConstant(S32, 0).getReg(0);
3562 
3563   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3564 }
3565 
3566 /// Handle register layout difference for f16 images for some subtargets.
3567 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3568                                              MachineRegisterInfo &MRI,
3569                                              Register Reg,
3570                                              bool ImageStore) const {
3571   const LLT S16 = LLT::scalar(16);
3572   const LLT S32 = LLT::scalar(32);
3573   LLT StoreVT = MRI.getType(Reg);
3574   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3575 
3576   if (ST.hasUnpackedD16VMem()) {
3577     auto Unmerge = B.buildUnmerge(S16, Reg);
3578 
3579     SmallVector<Register, 4> WideRegs;
3580     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3581       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3582 
3583     int NumElts = StoreVT.getNumElements();
3584 
3585     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3586   }
3587 
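  // On subtargets with the image store D16 bug, pad the packed d16 data out to
  // a whole number of dwords with undef elements before storing.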
3588   if (ImageStore && ST.hasImageStoreD16Bug()) {
3589     if (StoreVT.getNumElements() == 2) {
3590       SmallVector<Register, 4> PackedRegs;
3591       Reg = B.buildBitcast(S32, Reg).getReg(0);
3592       PackedRegs.push_back(Reg);
3593       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3594       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3595     }
3596 
3597     if (StoreVT.getNumElements() == 3) {
3598       SmallVector<Register, 4> PackedRegs;
3599       auto Unmerge = B.buildUnmerge(S16, Reg);
3600       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3601         PackedRegs.push_back(Unmerge.getReg(I));
3602       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3603       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3604       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3605     }
3606 
3607     if (StoreVT.getNumElements() == 4) {
3608       SmallVector<Register, 4> PackedRegs;
3609       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3610       auto Unmerge = B.buildUnmerge(S32, Reg);
3611       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3612         PackedRegs.push_back(Unmerge.getReg(I));
3613       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3614       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3615     }
3616 
3617     llvm_unreachable("invalid data type");
3618   }
3619 
3620   return Reg;
3621 }
3622 
3623 Register AMDGPULegalizerInfo::fixStoreSourceType(
3624   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3625   MachineRegisterInfo *MRI = B.getMRI();
3626   LLT Ty = MRI->getType(VData);
3627 
3628   const LLT S16 = LLT::scalar(16);
3629 
3630   // Fix up illegal register types for 8-bit and 16-bit stores.
3631   if (Ty == LLT::scalar(8) || Ty == S16) {
3632     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3633     return AnyExt;
3634   }
3635 
3636   if (Ty.isVector()) {
3637     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3638       if (IsFormat)
3639         return handleD16VData(B, *MRI, VData);
3640     }
3641   }
3642 
3643   return VData;
3644 }
3645 
3646 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3647                                               MachineRegisterInfo &MRI,
3648                                               MachineIRBuilder &B,
3649                                               bool IsTyped,
3650                                               bool IsFormat) const {
3651   Register VData = MI.getOperand(1).getReg();
3652   LLT Ty = MRI.getType(VData);
3653   LLT EltTy = Ty.getScalarType();
3654   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3655   const LLT S32 = LLT::scalar(32);
3656 
3657   VData = fixStoreSourceType(B, VData, IsFormat);
3658   Register RSrc = MI.getOperand(2).getReg();
3659 
3660   MachineMemOperand *MMO = *MI.memoperands_begin();
3661   const int MemSize = MMO->getSize();
3662 
3663   unsigned ImmOffset;
3664   unsigned TotalOffset;
3665 
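  // Operand layout after the intrinsic ID: vdata, rsrc, [vindex], voffset,
  // soffset, [format], aux.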
3666   // The typed intrinsics add an immediate after the registers.
3667   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3668 
3669   // The struct intrinsic variants add one additional operand over raw.
3670   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3671   Register VIndex;
3672   int OpOffset = 0;
3673   if (HasVIndex) {
3674     VIndex = MI.getOperand(3).getReg();
3675     OpOffset = 1;
3676   }
3677 
3678   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3679   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3680 
3681   unsigned Format = 0;
3682   if (IsTyped) {
3683     Format = MI.getOperand(5 + OpOffset).getImm();
3684     ++OpOffset;
3685   }
3686 
3687   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3688 
3689   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3690   if (TotalOffset != 0)
3691     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3692 
3693   unsigned Opc;
3694   if (IsTyped) {
3695     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3696                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3697   } else if (IsFormat) {
3698     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3699                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3700   } else {
3701     switch (MemSize) {
3702     case 1:
3703       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3704       break;
3705     case 2:
3706       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3707       break;
3708     default:
3709       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3710       break;
3711     }
3712   }
3713 
3714   if (!VIndex)
3715     VIndex = B.buildConstant(S32, 0).getReg(0);
3716 
3717   auto MIB = B.buildInstr(Opc)
3718     .addUse(VData)              // vdata
3719     .addUse(RSrc)               // rsrc
3720     .addUse(VIndex)             // vindex
3721     .addUse(VOffset)            // voffset
3722     .addUse(SOffset)            // soffset
3723     .addImm(ImmOffset);         // offset(imm)
3724 
3725   if (IsTyped)
3726     MIB.addImm(Format);
3727 
3728   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3729      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3730      .addMemOperand(MMO);
3731 
3732   MI.eraseFromParent();
3733   return true;
3734 }
3735 
3736 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3737                                              MachineRegisterInfo &MRI,
3738                                              MachineIRBuilder &B,
3739                                              bool IsFormat,
3740                                              bool IsTyped) const {
3741   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3742   MachineMemOperand *MMO = *MI.memoperands_begin();
3743   const int MemSize = MMO->getSize();
3744   const LLT S32 = LLT::scalar(32);
3745 
3746   Register Dst = MI.getOperand(0).getReg();
3747   Register RSrc = MI.getOperand(2).getReg();
3748 
3749   // The typed intrinsics add an immediate after the registers.
3750   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3751 
3752   // The struct intrinsic variants add one additional operand over raw.
3753   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3754   Register VIndex;
3755   int OpOffset = 0;
3756   if (HasVIndex) {
3757     VIndex = MI.getOperand(3).getReg();
3758     OpOffset = 1;
3759   }
3760 
3761   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3762   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3763 
3764   unsigned Format = 0;
3765   if (IsTyped) {
3766     Format = MI.getOperand(5 + OpOffset).getImm();
3767     ++OpOffset;
3768   }
3769 
3770   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3771   unsigned ImmOffset;
3772   unsigned TotalOffset;
3773 
3774   LLT Ty = MRI.getType(Dst);
3775   LLT EltTy = Ty.getScalarType();
3776   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3777   const bool Unpacked = ST.hasUnpackedD16VMem();
3778 
3779   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3780   if (TotalOffset != 0)
3781     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3782 
3783   unsigned Opc;
3784 
3785   if (IsTyped) {
3786     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3787                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3788   } else if (IsFormat) {
3789     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3790                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3791   } else {
3792     switch (MemSize) {
3793     case 1:
3794       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3795       break;
3796     case 2:
3797       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3798       break;
3799     default:
3800       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3801       break;
3802     }
3803   }
3804 
3805   Register LoadDstReg;
3806 
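  // Loads narrower than a dword, and scalar d16 loads, are widened to 32 bits
  // and truncated back to the original type after the buffer instruction.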
3807   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3808   LLT UnpackedTy = Ty.changeElementSize(32);
3809 
3810   if (IsExtLoad)
3811     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3812   else if (Unpacked && IsD16 && Ty.isVector())
3813     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3814   else
3815     LoadDstReg = Dst;
3816 
3817   if (!VIndex)
3818     VIndex = B.buildConstant(S32, 0).getReg(0);
3819 
3820   auto MIB = B.buildInstr(Opc)
3821     .addDef(LoadDstReg)         // vdata
3822     .addUse(RSrc)               // rsrc
3823     .addUse(VIndex)             // vindex
3824     .addUse(VOffset)            // voffset
3825     .addUse(SOffset)            // soffset
3826     .addImm(ImmOffset);         // offset(imm)
3827 
3828   if (IsTyped)
3829     MIB.addImm(Format);
3830 
3831   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3832      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3833      .addMemOperand(MMO);
3834 
3835   if (LoadDstReg != Dst) {
3836     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3837 
3838     // The result for an extending load was widened above; truncate it back down.
3839     if (IsExtLoad)
3840       B.buildTrunc(Dst, LoadDstReg);
3841     else {
3842       // Repack to original 16-bit vector result
3843       // FIXME: G_TRUNC should work, but legalization currently fails
3844       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3845       SmallVector<Register, 4> Repack;
3846       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3847         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3848       B.buildMerge(Dst, Repack);
3849     }
3850   }
3851 
3852   MI.eraseFromParent();
3853   return true;
3854 }
3855 
3856 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3857                                                MachineIRBuilder &B,
3858                                                bool IsInc) const {
3859   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3860                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3861   B.buildInstr(Opc)
3862     .addDef(MI.getOperand(0).getReg())
3863     .addUse(MI.getOperand(2).getReg())
3864     .addUse(MI.getOperand(3).getReg())
3865     .cloneMemRefs(MI);
3866   MI.eraseFromParent();
3867   return true;
3868 }
3869 
3870 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3871   switch (IntrID) {
3872   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3873   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3874     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3875   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3876   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3877     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3878   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3879   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3880     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3881   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3882   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3883     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3884   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3885   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3886     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3887   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3888   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3889     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3890   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3891   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3892     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3893   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3894   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3895     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3896   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3897   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3898     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3899   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3900   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3901     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3902   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3903   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3904     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3905   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3906   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3907     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3908   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3909   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3910     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3911   case Intrinsic::amdgcn_buffer_atomic_fadd:
3912   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3913   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3914     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3915   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3916   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3917     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3918   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3919   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3920     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3921   default:
3922     llvm_unreachable("unhandled atomic opcode");
3923   }
3924 }
3925 
3926 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3927                                                MachineIRBuilder &B,
3928                                                Intrinsic::ID IID) const {
3929   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3930                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3931   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3932 
3933   Register Dst;
3934 
3935   int OpOffset = 0;
3936   if (HasReturn) {
3937     // A few FP atomics do not support return values.
3938     Dst = MI.getOperand(0).getReg();
3939   } else {
3940     OpOffset = -1;
3941   }
3942 
3943   Register VData = MI.getOperand(2 + OpOffset).getReg();
3944   Register CmpVal;
3945 
3946   if (IsCmpSwap) {
3947     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3948     ++OpOffset;
3949   }
3950 
3951   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3952   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3953 
3954   // The struct intrinsic variants add one additional operand over raw.
3955   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3956   Register VIndex;
3957   if (HasVIndex) {
3958     VIndex = MI.getOperand(4 + OpOffset).getReg();
3959     ++OpOffset;
3960   }
3961 
3962   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3963   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3964   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3965 
3966   MachineMemOperand *MMO = *MI.memoperands_begin();
3967 
3968   unsigned ImmOffset;
3969   unsigned TotalOffset;
3970   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3971   if (TotalOffset != 0)
3972     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3973 
3974   if (!VIndex)
3975     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3976 
3977   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3978 
3979   if (HasReturn)
3980     MIB.addDef(Dst);
3981 
3982   MIB.addUse(VData); // vdata
3983 
3984   if (IsCmpSwap)
3985     MIB.addReg(CmpVal);
3986 
3987   MIB.addUse(RSrc)               // rsrc
3988      .addUse(VIndex)             // vindex
3989      .addUse(VOffset)            // voffset
3990      .addUse(SOffset)            // soffset
3991      .addImm(ImmOffset)          // offset(imm)
3992      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3993      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3994      .addMemOperand(MMO);
3995 
3996   MI.eraseFromParent();
3997   return true;
3998 }
3999 
4000 /// Pack the s16 typed address operands of \p MI into dword sized vectors with
4001 /// s16 typed elements, appending the results to \p PackedAddrs.
4002 static void packImageA16AddressToDwords(
4003     MachineIRBuilder &B, MachineInstr &MI,
4004     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4005     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4006   const LLT S16 = LLT::scalar(16);
4007   const LLT V2S16 = LLT::vector(2, 16);
4008 
4009   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4010     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4011     if (!SrcOp.isReg())
4012       continue; // _L to _LZ may have eliminated this.
4013 
4014     Register AddrReg = SrcOp.getReg();
4015 
4016     if (I < Intr->GradientStart) {
4017       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4018       PackedAddrs.push_back(AddrReg);
4019     } else {
4020       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4021       // derivatives dx/dh and dx/dv are packed with undef.
4022       if (((I + 1) >= EndIdx) ||
4023           ((Intr->NumGradients / 2) % 2 == 1 &&
4024            (I == static_cast<unsigned>(Intr->GradientStart +
4025                                        (Intr->NumGradients / 2) - 1) ||
4026             I == static_cast<unsigned>(Intr->GradientStart +
4027                                        Intr->NumGradients - 1))) ||
4028           // Check for _L to _LZ optimization
4029           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4030         PackedAddrs.push_back(
4031             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4032                 .getReg(0));
4033       } else {
4034         PackedAddrs.push_back(
4035             B.buildBuildVector(
4036                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4037                 .getReg(0));
4038         ++I;
4039       }
4040     }
4041   }
4042 }
4043 
4044 /// Convert from separate vaddr components to a single vector address register,
4045 /// and replace the remaining operands with $noreg.
4046 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4047                                      int DimIdx, int NumVAddrs) {
4048   const LLT S32 = LLT::scalar(32);
4049 
4050   SmallVector<Register, 8> AddrRegs;
4051   for (int I = 0; I != NumVAddrs; ++I) {
4052     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4053     if (SrcOp.isReg()) {
4054       AddrRegs.push_back(SrcOp.getReg());
4055       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4056     }
4057   }
4058 
4059   int NumAddrRegs = AddrRegs.size();
4060   if (NumAddrRegs != 1) {
4061     // Round up to 8 elements for v5-v7
4062     // FIXME: Missing intermediate sized register classes and instructions.
4063     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4064       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4065       auto Undef = B.buildUndef(S32);
4066       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4067       NumAddrRegs = RoundedNumRegs;
4068     }
4069 
4070     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4071     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4072   }
4073 
4074   for (int I = 1; I != NumVAddrs; ++I) {
4075     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4076     if (SrcOp.isReg())
4077       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4078   }
4079 }
4080 
4081 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4082 ///
4083 /// Depending on the subtarget, load/store with 16-bit element data need to be
4084 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4085 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4086 /// registers.
4087 ///
4088 /// We don't want to directly select image instructions just yet, but also want
4089 /// to expose all register repacking to the legalizer/combiners. We also don't
4090 /// want a selected instruction entering RegBankSelect. In order to avoid
4091 /// defining a multitude of intermediate image instructions, directly hack on
4092 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4093 /// the now unnecessary arguments with $noreg.
4094 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4095     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4096     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4097 
4098   const unsigned NumDefs = MI.getNumExplicitDefs();
4099   const unsigned ArgOffset = NumDefs + 1;
4100   bool IsTFE = NumDefs == 2;
4101   // We are only processing the operands of d16 image operations on subtargets
4102   // that use the unpacked register layout, or need to repack the TFE result.
4103 
4104   // TODO: Do we need to guard against already legalized intrinsics?
4105   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4106       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4107 
4108   MachineRegisterInfo *MRI = B.getMRI();
4109   const LLT S32 = LLT::scalar(32);
4110   const LLT S16 = LLT::scalar(16);
4111   const LLT V2S16 = LLT::vector(2, 16);
4112 
4113   unsigned DMask = 0;
4114 
4115   // Check for 16-bit addresses and pack them if so.
4116   LLT GradTy =
4117       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4118   LLT AddrTy =
4119       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4120   const bool IsG16 = GradTy == S16;
4121   const bool IsA16 = AddrTy == S16;
4122 
4123   int DMaskLanes = 0;
4124   if (!BaseOpcode->Atomic) {
4125     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4126     if (BaseOpcode->Gather4) {
4127       DMaskLanes = 4;
4128     } else if (DMask != 0) {
4129       DMaskLanes = countPopulation(DMask);
4130     } else if (!IsTFE && !BaseOpcode->Store) {
4131       // If dmask is 0, this is a no-op load. This can be eliminated.
4132       B.buildUndef(MI.getOperand(0));
4133       MI.eraseFromParent();
4134       return true;
4135     }
4136   }
4137 
4138   Observer.changingInstr(MI);
4139   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4140 
4141   unsigned NewOpcode = NumDefs == 0 ?
4142     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4143 
4144   // Track that we legalized this
4145   MI.setDesc(B.getTII().get(NewOpcode));
4146 
4147   // We are expecting to get an error flag since TFC is on and dmask is 0.
4148   // Force dmask to be at least 1, otherwise the instruction will fail.
4149   if (IsTFE && DMask == 0) {
4150     DMask = 0x1;
4151     DMaskLanes = 1;
4152     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4153   }
4154 
4155   if (BaseOpcode->Atomic) {
4156     Register VData0 = MI.getOperand(2).getReg();
4157     LLT Ty = MRI->getType(VData0);
4158 
4159     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4160     if (Ty.isVector())
4161       return false;
4162 
4163     if (BaseOpcode->AtomicX2) {
4164       Register VData1 = MI.getOperand(3).getReg();
4165       // The two values are packed in one register.
4166       LLT PackedTy = LLT::vector(2, Ty);
4167       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4168       MI.getOperand(2).setReg(Concat.getReg(0));
4169       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4170     }
4171   }
4172 
4173   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4174 
4175   // Optimize _L to _LZ when _L is zero
4176   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4177           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4178     const ConstantFP *ConstantLod;
4179 
4180     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4181                  m_GFCst(ConstantLod))) {
4182       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4183         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4184         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4185             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4186                                                       Intr->Dim);
4187 
4188         // The starting indexes should remain in the same place.
4189         --CorrectedNumVAddrs;
4190 
4191         MI.getOperand(MI.getNumExplicitDefs())
4192             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4193         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4194         Intr = NewImageDimIntr;
4195       }
4196     }
4197   }
4198 
4199   // Optimize _mip away when 'lod' is zero.
4200   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4201     int64_t ConstantLod;
4202     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4203                  m_ICst(ConstantLod))) {
4204       if (ConstantLod == 0) {
4205         // TODO: Change the intrinsic opcode and remove the operand instead of
4206         // replacing it with 0, as the _L to _LZ handling is done above.
4207         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4208         --CorrectedNumVAddrs;
4209       }
4210     }
4211   }
4212 
4213   // Rewrite the addressing register layout before doing anything else.
4214   if (IsA16 || IsG16) {
4215     if (IsA16) {
4216       // Target must support the feature and gradients need to be 16 bit too
4217       if (!ST.hasA16() || !IsG16)
4218         return false;
4219     } else if (!ST.hasG16())
4220       return false;
4221 
4222     if (Intr->NumVAddrs > 1) {
4223       SmallVector<Register, 4> PackedRegs;
4224       // Don't compress addresses for G16
4225       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4226       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4227                                   PackEndIdx);
4228 
4229       if (!IsA16) {
4230         // Add uncompressed address
4231         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4232           Register AddrReg = MI.getOperand(ArgOffset + I).getReg();
4233           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4234           PackedRegs.push_back(AddrReg);
4235         }
4236       }
4237 
4238       // See also below in the non-a16 branch
4239       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4240 
4241       if (!UseNSA && PackedRegs.size() > 1) {
4242         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4243         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4244         PackedRegs[0] = Concat.getReg(0);
4245         PackedRegs.resize(1);
4246       }
4247 
4248       const unsigned NumPacked = PackedRegs.size();
4249       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4250         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4251         if (!SrcOp.isReg()) {
4252           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4253           continue;
4254         }
4255 
4256         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4257 
4258         if (I - Intr->VAddrStart < NumPacked)
4259           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4260         else
4261           SrcOp.setReg(AMDGPU::NoRegister);
4262       }
4263     }
4264   } else {
4265     // If the register allocator cannot place the address registers contiguously
4266     // without introducing moves, then using the non-sequential address encoding
4267     // is always preferable, since it saves VALU instructions and is usually a
4268     // wash in terms of code size or even better.
4269     //
4270     // However, we currently have no way of hinting to the register allocator
4271     // that MIMG addresses should be placed contiguously when it is possible to
4272     // do so, so force non-NSA for the common 2-address case as a heuristic.
4273     //
4274     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4275     // allocation when possible.
4276     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4277 
4278     if (!UseNSA && Intr->NumVAddrs > 1)
4279       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4280                                Intr->NumVAddrs);
4281   }
4282 
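  // Record which of A16/G16 are in use as an immediate flags operand:
  // bit 0 = A16, bit 1 = G16.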
4283   int Flags = 0;
4284   if (IsA16)
4285     Flags |= 1;
4286   if (IsG16)
4287     Flags |= 2;
4288   MI.addOperand(MachineOperand::CreateImm(Flags));
4289 
4290   if (BaseOpcode->Store) { // No TFE for stores?
4291     // TODO: Handle dmask trim
4292     Register VData = MI.getOperand(1).getReg();
4293     LLT Ty = MRI->getType(VData);
4294     if (!Ty.isVector() || Ty.getElementType() != S16)
4295       return true;
4296 
4297     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4298     if (RepackedReg != VData) {
4299       MI.getOperand(1).setReg(RepackedReg);
4300     }
4301 
4302     return true;
4303   }
4304 
4305   Register DstReg = MI.getOperand(0).getReg();
4306   LLT Ty = MRI->getType(DstReg);
4307   const LLT EltTy = Ty.getScalarType();
4308   const bool IsD16 = Ty.getScalarType() == S16;
4309   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4310 
4311   // Confirm that the return type is large enough for the dmask specified
4312   if (NumElts < DMaskLanes)
4313     return false;
4314 
4315   if (NumElts > 4 || DMaskLanes > 4)
4316     return false;
4317 
4318   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4319   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4320 
4321   // The raw dword-aligned data component of the load. The only legal cases
4322   // where this matters should be when using the packed D16 format, for
4323   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4324   LLT RoundedTy;
4325 
4326   // S32 vector to cover all data, plus the TFE result element.
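  // For example, a TFE load returning <4 x s32> uses a <5 x s32> TFE type:
  // four data dwords followed by one status dword.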
4327   LLT TFETy;
4328 
4329   // Register type to use for each loaded component. Will be S32 or V2S16.
4330   LLT RegTy;
4331 
4332   if (IsD16 && ST.hasUnpackedD16VMem()) {
4333     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4334     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4335     RegTy = S32;
4336   } else {
4337     unsigned EltSize = EltTy.getSizeInBits();
4338     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4339     unsigned RoundedSize = 32 * RoundedElts;
4340     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4341     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4342     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4343   }
4344 
4345   // The return type does not need adjustment.
4346   // TODO: Should we change s16 case to s32 or <2 x s16>?
4347   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4348     return true;
4349 
4350   Register Dst1Reg;
4351 
4352   // Insert after the instruction.
4353   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4354 
4355   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4356   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4357   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4358   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4359 
4360   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4361 
4362   MI.getOperand(0).setReg(NewResultReg);
4363 
4364   // In the IR, TFE is supposed to be used with a 2 element struct return
4365   // type. The instruction really returns these two values in one contiguous
4366   // register, with one additional dword beyond the loaded data. Rewrite the
4367   // return type to use a single register result.
4368 
4369   if (IsTFE) {
4370     Dst1Reg = MI.getOperand(1).getReg();
4371     if (MRI->getType(Dst1Reg) != S32)
4372       return false;
4373 
4374     // TODO: Make sure the TFE operand bit is set.
4375     MI.RemoveOperand(1);
4376 
4377     // Handle the easy case that requires no repack instructions.
4378     if (Ty == S32) {
4379       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4380       return true;
4381     }
4382   }
4383 
4384   // Now figure out how to copy the new result register back into the old
4385   // result.
4386   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4387 
4388   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4389 
4390   if (ResultNumRegs == 1) {
4391     assert(!IsTFE);
4392     ResultRegs[0] = NewResultReg;
4393   } else {
4394     // We have to repack into a new vector of some kind.
4395     for (int I = 0; I != NumDataRegs; ++I)
4396       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4397     B.buildUnmerge(ResultRegs, NewResultReg);
4398 
4399     // Drop the final TFE element to get the data part. The TFE result is
4400     // directly written to the right place already.
4401     if (IsTFE)
4402       ResultRegs.resize(NumDataRegs);
4403   }
4404 
4405   // For an s16 scalar result, we form an s32 result with a truncate regardless
4406   // of packed vs. unpacked.
4407   if (IsD16 && !Ty.isVector()) {
4408     B.buildTrunc(DstReg, ResultRegs[0]);
4409     return true;
4410   }
4411 
4412   // Avoid a build/concat_vector of 1 entry.
4413   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4414     B.buildBitcast(DstReg, ResultRegs[0]);
4415     return true;
4416   }
4417 
4418   assert(Ty.isVector());
4419 
4420   if (IsD16) {
4421     // For packed D16 results with TFE enabled, all the data components are
4422     // S32. Cast back to the expected type.
4423     //
4424     // TODO: We don't really need to load s32 elements. We would only need
4425     // one cast for the TFE result if a multiple of v2s16 was used.
4426     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4427       for (Register &Reg : ResultRegs)
4428         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4429     } else if (ST.hasUnpackedD16VMem()) {
4430       for (Register &Reg : ResultRegs)
4431         Reg = B.buildTrunc(S16, Reg).getReg(0);
4432     }
4433   }
4434 
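  // Append NumElts undef values of type Ty so ResultRegs covers the full
  // destination vector.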
4435   auto padWithUndef = [&](LLT Ty, int NumElts) {
4436     if (NumElts == 0)
4437       return;
4438     Register Undef = B.buildUndef(Ty).getReg(0);
4439     for (int I = 0; I != NumElts; ++I)
4440       ResultRegs.push_back(Undef);
4441   };
4442 
4443   // Pad out any elements eliminated due to the dmask.
4444   LLT ResTy = MRI->getType(ResultRegs[0]);
4445   if (!ResTy.isVector()) {
4446     padWithUndef(ResTy, NumElts - ResultRegs.size());
4447     B.buildBuildVector(DstReg, ResultRegs);
4448     return true;
4449   }
4450 
4451   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4452   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4453 
4454   // Deal with the one annoying legal case.
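  // <3 x s16> is legal but does not split evenly into <2 x s16> pieces: pad to
  // three <2 x s16> registers, concatenate into <6 x s16>, then unmerge and
  // keep only the first <3 x s16> half.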
4455   const LLT V3S16 = LLT::vector(3, 16);
4456   if (Ty == V3S16) {
4457     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4458     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4459     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4460     return true;
4461   }
4462 
4463   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4464   B.buildConcatVectors(DstReg, ResultRegs);
4465   return true;
4466 }
4467 
4468 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4469   LegalizerHelper &Helper, MachineInstr &MI) const {
4470   MachineIRBuilder &B = Helper.MIRBuilder;
4471   GISelChangeObserver &Observer = Helper.Observer;
4472 
4473   Register Dst = MI.getOperand(0).getReg();
4474   LLT Ty = B.getMRI()->getType(Dst);
4475   unsigned Size = Ty.getSizeInBits();
4476   MachineFunction &MF = B.getMF();
4477 
4478   Observer.changingInstr(MI);
4479 
4480   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4481     Ty = getBitcastRegisterType(Ty);
4482     Helper.bitcastDst(MI, Ty, 0);
4483     Dst = MI.getOperand(0).getReg();
4484     B.setInsertPt(B.getMBB(), MI);
4485   }
4486 
4487   // FIXME: We don't really need this intermediate instruction. The intrinsic
4488   // should be fixed to have a memory operand. Since it's readnone, we're not
4489   // allowed to add one.
4490   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4491   MI.RemoveOperand(1); // Remove intrinsic ID
4492 
4493   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4494   // TODO: Should this use datalayout alignment?
4495   const unsigned MemSize = (Size + 7) / 8;
4496   const Align MemAlign(4);
4497   MachineMemOperand *MMO = MF.getMachineMemOperand(
4498       MachinePointerInfo(),
4499       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4500           MachineMemOperand::MOInvariant,
4501       MemSize, MemAlign);
4502   MI.addMemOperand(MF, MMO);
4503 
4504   // There are no 96-bit result scalar loads, but widening to 128-bit should
4505   // always be legal. We may need to restore this to a 96-bit result if it turns
4506   // out this needs to be converted to a vector load during RegBankSelect.
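  // For example, an s96 result is widened to s128, and a <3 x s32> result to
  // <4 x s32>.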
4507   if (!isPowerOf2_32(Size)) {
4508     if (Ty.isVector())
4509       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4510     else
4511       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4512   }
4513 
4514   Observer.changedInstr(MI);
4515   return true;
4516 }
4517 
4518 // TODO: Move to selection
4519 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4520                                                 MachineRegisterInfo &MRI,
4521                                                 MachineIRBuilder &B) const {
4522   // Non-HSA path or trap handler disabled: insert an s_endpgm instruction.
4523   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4524       !ST.isTrapHandlerEnabled()) {
4525     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4526   } else {
4527     // Pass queue pointer to trap handler as input, and insert trap instruction
4528     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
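    // The HSA trap handler ABI expects the queue pointer in SGPR0_SGPR1.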
4529     MachineRegisterInfo &MRI = *B.getMRI();
4530 
4531     Register LiveIn =
4532       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4533     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4534       return false;
4535 
4536     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4537     B.buildCopy(SGPR01, LiveIn);
4538     B.buildInstr(AMDGPU::S_TRAP)
4539         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4540         .addReg(SGPR01, RegState::Implicit);
4541   }
4542 
4543   MI.eraseFromParent();
4544   return true;
4545 }
4546 
4547 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4548     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4549   // Non-HSA path or trap handler disabled: report a warning that the
4550   // debugtrap handler is not supported.
4551   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4552       !ST.isTrapHandlerEnabled()) {
4553     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4554                                      "debugtrap handler not supported",
4555                                      MI.getDebugLoc(), DS_Warning);
4556     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4557     Ctx.diagnose(NoTrap);
4558   } else {
4559     // Insert debug-trap instruction
4560     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4561   }
4562 
4563   MI.eraseFromParent();
4564   return true;
4565 }
4566 
4567 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4568                                                MachineIRBuilder &B) const {
4569   MachineRegisterInfo &MRI = *B.getMRI();
4570   const LLT S16 = LLT::scalar(16);
4571   const LLT S32 = LLT::scalar(32);
4572 
4573   Register DstReg = MI.getOperand(0).getReg();
4574   Register NodePtr = MI.getOperand(2).getReg();
4575   Register RayExtent = MI.getOperand(3).getReg();
4576   Register RayOrigin = MI.getOperand(4).getReg();
4577   Register RayDir = MI.getOperand(5).getReg();
4578   Register RayInvDir = MI.getOperand(6).getReg();
4579   Register TDescr = MI.getOperand(7).getReg();
4580 
4581   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4582   bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
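  // Pick the NSA intrinsic variant: _a16 when the ray direction components are
  // 16-bit, and the BVH64 form when the node pointer is 64-bit.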
4583   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4584                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4585                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4586                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4587 
4588   SmallVector<Register, 12> Ops;
4589   if (Is64) {
4590     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4591     Ops.push_back(Unmerge.getReg(0));
4592     Ops.push_back(Unmerge.getReg(1));
4593   } else {
4594     Ops.push_back(NodePtr);
4595   }
4596   Ops.push_back(RayExtent);
4597 
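  // Push the first three 32-bit lanes of a 128-bit vector operand; the fourth
  // lane is not needed.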
4598   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4599     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4600     Ops.push_back(Unmerge.getReg(0));
4601     Ops.push_back(Unmerge.getReg(1));
4602     Ops.push_back(Unmerge.getReg(2));
4603   };
4604 
4605   packLanes(RayOrigin);
4606   if (IsA16) {
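    // Pack the six 16-bit components (dir.xyz followed by inv_dir.xyz) into
    // three dwords; the fourth lane of each source vector is ignored.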
4607     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4608     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4609     Register R1 = MRI.createGenericVirtualRegister(S32);
4610     Register R2 = MRI.createGenericVirtualRegister(S32);
4611     Register R3 = MRI.createGenericVirtualRegister(S32);
4612     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4613     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4614     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4615     Ops.push_back(R1);
4616     Ops.push_back(R2);
4617     Ops.push_back(R3);
4618   } else {
4619     packLanes(RayDir);
4620     packLanes(RayInvDir);
4621   }
4622 
4623   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4624     .addDef(DstReg)
4625     .addImm(Opcode);
4626 
4627   for (Register R : Ops) {
4628     MIB.addUse(R);
4629   }
4630 
4631   MIB.addUse(TDescr)
4632      .addImm(IsA16 ? 1 : 0)
4633      .cloneMemRefs(MI);
4634 
4635   MI.eraseFromParent();
4636   return true;
4637 }
4638 
4639 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4640                                             MachineInstr &MI) const {
4641   MachineIRBuilder &B = Helper.MIRBuilder;
4642   MachineRegisterInfo &MRI = *B.getMRI();
4643 
4644   // Replace the G_BRCOND user with exec-manipulating branch pseudos.
4645   auto IntrID = MI.getIntrinsicID();
4646   switch (IntrID) {
4647   case Intrinsic::amdgcn_if:
4648   case Intrinsic::amdgcn_else: {
4649     MachineInstr *Br = nullptr;
4650     MachineBasicBlock *UncondBrTarget = nullptr;
4651     bool Negated = false;
4652     if (MachineInstr *BrCond =
4653             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4654       const SIRegisterInfo *TRI
4655         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4656 
4657       Register Def = MI.getOperand(1).getReg();
4658       Register Use = MI.getOperand(3).getReg();
4659 
4660       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4661 
4662       if (Negated)
4663         std::swap(CondBrTarget, UncondBrTarget);
4664 
4665       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4666       if (IntrID == Intrinsic::amdgcn_if) {
4667         B.buildInstr(AMDGPU::SI_IF)
4668           .addDef(Def)
4669           .addUse(Use)
4670           .addMBB(UncondBrTarget);
4671       } else {
4672         B.buildInstr(AMDGPU::SI_ELSE)
4673             .addDef(Def)
4674             .addUse(Use)
4675             .addMBB(UncondBrTarget);
4676       }
4677 
4678       if (Br) {
4679         Br->getOperand(0).setMBB(CondBrTarget);
4680       } else {
4681         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4682         // since we're swapping branch targets it needs to be reinserted.
4683         // FIXME: IRTranslator should probably not do this
4684         B.buildBr(*CondBrTarget);
4685       }
4686 
4687       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4688       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4689       MI.eraseFromParent();
4690       BrCond->eraseFromParent();
4691       return true;
4692     }
4693 
4694     return false;
4695   }
4696   case Intrinsic::amdgcn_loop: {
4697     MachineInstr *Br = nullptr;
4698     MachineBasicBlock *UncondBrTarget = nullptr;
4699     bool Negated = false;
4700     if (MachineInstr *BrCond =
4701             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4702       const SIRegisterInfo *TRI
4703         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4704 
4705       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4706       Register Reg = MI.getOperand(2).getReg();
4707 
4708       if (Negated)
4709         std::swap(CondBrTarget, UncondBrTarget);
4710 
4711       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4712       B.buildInstr(AMDGPU::SI_LOOP)
4713         .addUse(Reg)
4714         .addMBB(UncondBrTarget);
4715 
4716       if (Br)
4717         Br->getOperand(0).setMBB(CondBrTarget);
4718       else
4719         B.buildBr(*CondBrTarget);
4720 
4721       MI.eraseFromParent();
4722       BrCond->eraseFromParent();
4723       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4724       return true;
4725     }
4726 
4727     return false;
4728   }
4729   case Intrinsic::amdgcn_kernarg_segment_ptr:
4730     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4731       // This only makes sense to call in a kernel, so just lower to null.
4732       B.buildConstant(MI.getOperand(0).getReg(), 0);
4733       MI.eraseFromParent();
4734       return true;
4735     }
4736 
4737     return legalizePreloadedArgIntrin(
4738       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4739   case Intrinsic::amdgcn_implicitarg_ptr:
4740     return legalizeImplicitArgPtr(MI, MRI, B);
4741   case Intrinsic::amdgcn_workitem_id_x:
4742     return legalizePreloadedArgIntrin(MI, MRI, B,
4743                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4744   case Intrinsic::amdgcn_workitem_id_y:
4745     return legalizePreloadedArgIntrin(MI, MRI, B,
4746                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4747   case Intrinsic::amdgcn_workitem_id_z:
4748     return legalizePreloadedArgIntrin(MI, MRI, B,
4749                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4750   case Intrinsic::amdgcn_workgroup_id_x:
4751     return legalizePreloadedArgIntrin(MI, MRI, B,
4752                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4753   case Intrinsic::amdgcn_workgroup_id_y:
4754     return legalizePreloadedArgIntrin(MI, MRI, B,
4755                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4756   case Intrinsic::amdgcn_workgroup_id_z:
4757     return legalizePreloadedArgIntrin(MI, MRI, B,
4758                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4759   case Intrinsic::amdgcn_dispatch_ptr:
4760     return legalizePreloadedArgIntrin(MI, MRI, B,
4761                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4762   case Intrinsic::amdgcn_queue_ptr:
4763     return legalizePreloadedArgIntrin(MI, MRI, B,
4764                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4765   case Intrinsic::amdgcn_implicit_buffer_ptr:
4766     return legalizePreloadedArgIntrin(
4767       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4768   case Intrinsic::amdgcn_dispatch_id:
4769     return legalizePreloadedArgIntrin(MI, MRI, B,
4770                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4771   case Intrinsic::amdgcn_fdiv_fast:
4772     return legalizeFDIVFastIntrin(MI, MRI, B);
4773   case Intrinsic::amdgcn_is_shared:
4774     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4775   case Intrinsic::amdgcn_is_private:
4776     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4777   case Intrinsic::amdgcn_wavefrontsize: {
4778     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4779     MI.eraseFromParent();
4780     return true;
4781   }
4782   case Intrinsic::amdgcn_s_buffer_load:
4783     return legalizeSBufferLoad(Helper, MI);
4784   case Intrinsic::amdgcn_raw_buffer_store:
4785   case Intrinsic::amdgcn_struct_buffer_store:
4786     return legalizeBufferStore(MI, MRI, B, false, false);
4787   case Intrinsic::amdgcn_raw_buffer_store_format:
4788   case Intrinsic::amdgcn_struct_buffer_store_format:
4789     return legalizeBufferStore(MI, MRI, B, false, true);
4790   case Intrinsic::amdgcn_raw_tbuffer_store:
4791   case Intrinsic::amdgcn_struct_tbuffer_store:
4792     return legalizeBufferStore(MI, MRI, B, true, true);
4793   case Intrinsic::amdgcn_raw_buffer_load:
4794   case Intrinsic::amdgcn_struct_buffer_load:
4795     return legalizeBufferLoad(MI, MRI, B, false, false);
4796   case Intrinsic::amdgcn_raw_buffer_load_format:
4797   case Intrinsic::amdgcn_struct_buffer_load_format:
4798     return legalizeBufferLoad(MI, MRI, B, true, false);
4799   case Intrinsic::amdgcn_raw_tbuffer_load:
4800   case Intrinsic::amdgcn_struct_tbuffer_load:
4801     return legalizeBufferLoad(MI, MRI, B, true, true);
4802   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4803   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4804   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4805   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4806   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4807   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4808   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4809   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4810   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4811   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4812   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4813   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4814   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4815   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4816   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4817   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4818   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4819   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4820   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4821   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4822   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4823   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4824   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4825   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4826   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4827   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4828   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4829   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4830   case Intrinsic::amdgcn_buffer_atomic_fadd:
4831   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4832   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4833   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4834   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4835     return legalizeBufferAtomic(MI, B, IntrID);
4836   case Intrinsic::amdgcn_atomic_inc:
4837     return legalizeAtomicIncDec(MI, B, true);
4838   case Intrinsic::amdgcn_atomic_dec:
4839     return legalizeAtomicIncDec(MI, B, false);
4840   case Intrinsic::trap:
4841     return legalizeTrapIntrinsic(MI, MRI, B);
4842   case Intrinsic::debugtrap:
4843     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4844   case Intrinsic::amdgcn_rsq_clamp:
4845     return legalizeRsqClampIntrinsic(MI, MRI, B);
4846   case Intrinsic::amdgcn_ds_fadd:
4847   case Intrinsic::amdgcn_ds_fmin:
4848   case Intrinsic::amdgcn_ds_fmax:
4849     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4850   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4851     return legalizeBVHIntrinsic(MI, B);
4852   default: {
4853     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4854             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4855       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4856     return true;
4857   }
4858   }
4859 
4860   return true;
4861 }
4862