1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/ADT/ScopeExit.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 
28 #define DEBUG_TYPE "amdgpu-legalinfo"
29 
30 using namespace llvm;
31 using namespace LegalizeActions;
32 using namespace LegalizeMutations;
33 using namespace LegalityPredicates;
34 using namespace MIPatternMatch;
35 
36 // Hack until load/store selection patterns support any tuple of legal types.
37 static cl::opt<bool> EnableNewLegality(
38   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
41   cl::init(false),
42   cl::ReallyHidden);
43 
44 static constexpr unsigned MaxRegisterSize = 1024;
45 
// Round the number of elements up to the next power of two.
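// For example (illustrative): <3 x s16> becomes <4 x s16> and <5 x s32>
// becomes <8 x s32>; power-of-two element counts are left unchanged.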
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
// Round the number of bits up to the next power of two.
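// For example (illustrative): s7 -> s8, s24 -> s32, s65 -> s128. Note that a
// vector input is flattened to a scalar of its rounded total size.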
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x
/// s16>. This excludes s1 vectors, which should always be scalarized.
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     if (!Ty.isVector())
67       return false;
68 
69     const LLT EltTy = Ty.getElementType();
70     const unsigned EltSize = EltTy.getSizeInBits();
71     return Ty.getNumElements() % 2 != 0 &&
72            EltSize > 1 && EltSize < 32 &&
73            Ty.getSizeInBits() % 32 != 0;
74   };
75 }
76 
77 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
78   return [=](const LegalityQuery &Query) {
79     const LLT Ty = Query.Types[TypeIdx];
80     return Ty.getSizeInBits() % 32 == 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
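// Mutation that pads the vector at TypeIdx with one additional element of the
// same type, e.g. <3 x s16> -> <4 x s16>; typically paired with
// isSmallOddVector above.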
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
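// Mutation that reduces the element count so each resulting piece is at most
// 64 bits wide, e.g. <4 x s32> -> <2 x s32> and <8 x s16> -> <4 x s16>
// (illustrative).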
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
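// For example (illustrative): <3 x s8> (24 bits) -> <4 x s8> (32 bits), and
// <5 x s16> (80 bits) -> <6 x s16> (96 bits).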
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
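// Return the type a value is bitcast to so it occupies whole registers: sizes
// of 32 bits or less become a scalar of that size, and larger sizes become a
// vector of 32-bit elements, e.g. <6 x s16> (96 bits) -> <3 x s32>
// (illustrative).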
129 static LLT getBitcastRegisterType(const LLT Ty) {
130   const unsigned Size = Ty.getSizeInBits();
131 
133   if (Size <= 32) {
134     // <2 x s8> -> s16
135     // <4 x s8> -> s32
136     return LLT::scalar(Size);
137   }
138 
139   return LLT::scalarOrVector(Size / 32, 32);
140 }
141 
142 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
143   return [=](const LegalityQuery &Query) {
144     const LLT Ty = Query.Types[TypeIdx];
145     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
146   };
147 }
148 
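// Bitcast to a vector of 32-bit elements with the same total size, e.g.
// <4 x s16> -> <2 x s32>; a 32-bit type becomes a plain s32. Asserts the size
// is a multiple of 32.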
149 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT Ty = Query.Types[TypeIdx];
152     unsigned Size = Ty.getSizeInBits();
153     assert(Size % 32 == 0);
154     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
155   };
156 }
157 
158 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
159   return [=](const LegalityQuery &Query) {
160     const LLT QueryTy = Query.Types[TypeIdx];
161     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
162   };
163 }
164 
165 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
166   return [=](const LegalityQuery &Query) {
167     const LLT QueryTy = Query.Types[TypeIdx];
168     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
169   };
170 }
171 
172 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
173   return [=](const LegalityQuery &Query) {
174     const LLT QueryTy = Query.Types[TypeIdx];
175     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
176   };
177 }
178 
179 static bool isRegisterSize(unsigned Size) {
180   return Size % 32 == 0 && Size <= MaxRegisterSize;
181 }
182 
183 static bool isRegisterVectorElementType(LLT EltTy) {
184   const int EltSize = EltTy.getSizeInBits();
185   return EltSize == 16 || EltSize % 32 == 0;
186 }
187 
188 static bool isRegisterVectorType(LLT Ty) {
189   const int EltSize = Ty.getElementType().getSizeInBits();
190   return EltSize == 32 || EltSize == 64 ||
191          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
192          EltSize == 128 || EltSize == 256;
193 }
194 
195 static bool isRegisterType(LLT Ty) {
196   if (!isRegisterSize(Ty.getSizeInBits()))
197     return false;
198 
199   if (Ty.isVector())
200     return isRegisterVectorType(Ty);
201 
202   return true;
203 }
204 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
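// Illustrative: s32, s96, v2s32, v4s16 and v2s64 are register types; s40 and
// <3 x s16> (48 bits) are not.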
207 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
208   return [=](const LegalityQuery &Query) {
209     return isRegisterType(Query.Types[TypeIdx]);
210   };
211 }
212 
213 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     const LLT QueryTy = Query.Types[TypeIdx];
216     if (!QueryTy.isVector())
217       return false;
218     const LLT EltTy = QueryTy.getElementType();
219     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
220   };
221 }
222 
223 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
224   return [=](const LegalityQuery &Query) {
225     const LLT Ty = Query.Types[TypeIdx];
226     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
227            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
228   };
229 }
230 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
234 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
235                                     bool IsLoad) {
236   switch (AS) {
237   case AMDGPUAS::PRIVATE_ADDRESS:
238     // FIXME: Private element size.
239     return ST.enableFlatScratch() ? 128 : 32;
240   case AMDGPUAS::LOCAL_ADDRESS:
241     return ST.useDS128() ? 128 : 64;
242   case AMDGPUAS::GLOBAL_ADDRESS:
243   case AMDGPUAS::CONSTANT_ADDRESS:
244   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
251     return IsLoad ? 512 : 128;
252   default:
253     // Flat addresses may contextually need to be split to 32-bit parts if they
254     // may alias scratch depending on the subtarget.
255     return 128;
256   }
257 }
258 
259 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
260                                  const LegalityQuery &Query,
261                                  unsigned Opcode) {
262   const LLT Ty = Query.Types[0];
263 
264   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
265   const bool IsLoad = Opcode != AMDGPU::G_STORE;
266 
267   unsigned RegSize = Ty.getSizeInBits();
268   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
269   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
270   unsigned AS = Query.Types[1].getAddressSpace();
271 
272   // All of these need to be custom lowered to cast the pointer operand.
273   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
274     return false;
275 
276   // TODO: We should be able to widen loads if the alignment is high enough, but
277   // we also need to modify the memory access size.
278 #if 0
279   // Accept widening loads based on alignment.
280   if (IsLoad && MemSize < Size)
281     MemSize = std::max(MemSize, Align);
282 #endif
283 
284   // Only 1-byte and 2-byte to 32-bit extloads are valid.
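  // Illustrative: an s32 result with an 8-bit or 16-bit memory access passes;
  // an s64 result with a 48-bit access is rejected here.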
285   if (MemSize != RegSize && RegSize != 32)
286     return false;
287 
288   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
289     return false;
290 
291   switch (MemSize) {
292   case 8:
293   case 16:
294   case 32:
295   case 64:
296   case 128:
297     break;
298   case 96:
299     if (!ST.hasDwordx3LoadStores())
300       return false;
301     break;
302   case 256:
303   case 512:
304     // These may contextually need to be broken down.
305     break;
306   default:
307     return false;
308   }
309 
310   assert(RegSize >= MemSize);
311 
312   if (AlignBits < MemSize) {
313     const SITargetLowering *TLI = ST.getTargetLowering();
314     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
315                                                  Align(AlignBits / 8)))
316       return false;
317   }
318 
319   return true;
320 }
321 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
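// Illustrative: returns true for s96 and <6 x s16> (these are instead bitcast
// to <3 x s32> by the rules below), and false for s64 and <4 x s32>.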
326 static bool loadStoreBitcastWorkaround(const LLT Ty) {
327   if (EnableNewLegality)
328     return false;
329 
330   const unsigned Size = Ty.getSizeInBits();
331   if (Size <= 64)
332     return false;
333   if (!Ty.isVector())
334     return true;
335 
336   LLT EltTy = Ty.getElementType();
337   if (EltTy.isPointer())
338     return true;
339 
340   unsigned EltSize = EltTy.getSizeInBits();
341   return EltSize != 32 && EltSize != 64;
342 }
343 
344 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
345                              unsigned Opcode) {
346   const LLT Ty = Query.Types[0];
347   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
348          !loadStoreBitcastWorkaround(Ty);
349 }
350 
351 /// Return true if a load or store of the type should be lowered with a bitcast
352 /// to a different type.
353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354                                        const unsigned MemSizeInBits) {
355   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
358 
359   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360     return true;
361   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362          !isRegisterVectorElementType(Ty.getElementType());
363 }
364 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access
/// itself changes, not the size of the result register.
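/// Illustrative: a 96-bit load with 128-bit alignment may be widened to 128
/// bits when dwordx3 load/stores are unavailable, while a 96-bit load with
/// only 32-bit alignment is left alone.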
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
369                             unsigned AlignInBits, unsigned AddrSpace,
370                             unsigned Opcode) {
371   // We don't want to widen cases that are naturally legal.
372   if (isPowerOf2_32(SizeInBits))
373     return false;
374 
375   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
376   // end up widening these for a scalar load during RegBankSelect, since there
377   // aren't 96-bit scalar loads.
378   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
379     return false;
380 
381   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
382     return false;
383 
384   // A load is known dereferenceable up to the alignment, so it's legal to widen
385   // to it.
386   //
387   // TODO: Could check dereferenceable for less aligned cases.
388   unsigned RoundedSize = NextPowerOf2(SizeInBits);
389   if (AlignInBits < RoundedSize)
390     return false;
391 
392   // Do not widen if it would introduce a slow unaligned load.
393   const SITargetLowering *TLI = ST.getTargetLowering();
394   bool Fast = false;
395   return TLI->allowsMisalignedMemoryAccessesImpl(
396              RoundedSize, AddrSpace, Align(AlignInBits / 8),
397              MachineMemOperand::MOLoad, &Fast) &&
398          Fast;
399 }
400 
401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402                             unsigned Opcode) {
403   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404     return false;
405 
406   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407                          Query.MMODescrs[0].AlignInBits,
408                          Query.Types[1].getAddressSpace(), Opcode);
409 }
410 
411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
412                                          const GCNTargetMachine &TM)
413   :  ST(ST_) {
414   using namespace TargetOpcode;
415 
416   auto GetAddrSpacePtr = [&TM](unsigned AS) {
417     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
418   };
419 
420   const LLT S1 = LLT::scalar(1);
421   const LLT S8 = LLT::scalar(8);
422   const LLT S16 = LLT::scalar(16);
423   const LLT S32 = LLT::scalar(32);
424   const LLT S64 = LLT::scalar(64);
425   const LLT S128 = LLT::scalar(128);
426   const LLT S256 = LLT::scalar(256);
427   const LLT S512 = LLT::scalar(512);
428   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
429 
430   const LLT V2S8 = LLT::vector(2, 8);
431   const LLT V2S16 = LLT::vector(2, 16);
432   const LLT V4S16 = LLT::vector(4, 16);
433 
434   const LLT V2S32 = LLT::vector(2, 32);
435   const LLT V3S32 = LLT::vector(3, 32);
436   const LLT V4S32 = LLT::vector(4, 32);
437   const LLT V5S32 = LLT::vector(5, 32);
438   const LLT V6S32 = LLT::vector(6, 32);
439   const LLT V7S32 = LLT::vector(7, 32);
440   const LLT V8S32 = LLT::vector(8, 32);
441   const LLT V9S32 = LLT::vector(9, 32);
442   const LLT V10S32 = LLT::vector(10, 32);
443   const LLT V11S32 = LLT::vector(11, 32);
444   const LLT V12S32 = LLT::vector(12, 32);
445   const LLT V13S32 = LLT::vector(13, 32);
446   const LLT V14S32 = LLT::vector(14, 32);
447   const LLT V15S32 = LLT::vector(15, 32);
448   const LLT V16S32 = LLT::vector(16, 32);
449   const LLT V32S32 = LLT::vector(32, 32);
450 
451   const LLT V2S64 = LLT::vector(2, 64);
452   const LLT V3S64 = LLT::vector(3, 64);
453   const LLT V4S64 = LLT::vector(4, 64);
454   const LLT V5S64 = LLT::vector(5, 64);
455   const LLT V6S64 = LLT::vector(6, 64);
456   const LLT V7S64 = LLT::vector(7, 64);
457   const LLT V8S64 = LLT::vector(8, 64);
458   const LLT V16S64 = LLT::vector(16, 64);
459 
460   std::initializer_list<LLT> AllS32Vectors =
461     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463   std::initializer_list<LLT> AllS64Vectors =
464     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
465 
466   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
467   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
468   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
469   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
470   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
471   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
472   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
473 
474   const LLT CodePtr = FlatPtr;
475 
476   const std::initializer_list<LLT> AddrSpaces64 = {
477     GlobalPtr, ConstantPtr, FlatPtr
478   };
479 
480   const std::initializer_list<LLT> AddrSpaces32 = {
481     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
482   };
483 
484   const std::initializer_list<LLT> FPTypesBase = {
485     S32, S64
486   };
487 
488   const std::initializer_list<LLT> FPTypes16 = {
489     S32, S64, S16
490   };
491 
492   const std::initializer_list<LLT> FPTypesPK16 = {
493     S32, S64, S16, V2S16
494   };
495 
496   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
497 
498   setAction({G_BRCOND, S1}, Legal); // VCC branches
499   setAction({G_BRCOND, S32}, Legal); // SCC branches
500 
501   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
502   // elements for v3s16
503   getActionDefinitionsBuilder(G_PHI)
504     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505     .legalFor(AllS32Vectors)
506     .legalFor(AllS64Vectors)
507     .legalFor(AddrSpaces64)
508     .legalFor(AddrSpaces32)
509     .legalIf(isPointer(0))
510     .clampScalar(0, S16, S256)
511     .widenScalarToNextPow2(0, 32)
512     .clampMaxNumElements(0, S32, 16)
513     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514     .scalarize(0);
515 
516   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517     // Full set of gfx9 features.
518     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
519       .legalFor({S32, S16, V2S16})
520       .clampScalar(0, S16, S32)
521       .clampMaxNumElements(0, S16, 2)
522       .scalarize(0)
523       .widenScalarToNextPow2(0, 32);
524 
525     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526       .legalFor({S32, S16, V2S16}) // Clamp modifier
527       .minScalarOrElt(0, S16)
528       .clampMaxNumElements(0, S16, 2)
529       .scalarize(0)
530       .widenScalarToNextPow2(0, 32)
531       .lower();
532   } else if (ST.has16BitInsts()) {
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16})
535       .clampScalar(0, S16, S32)
536       .scalarize(0)
537       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538 
    // Technically the saturating operations require clamp bit support, but
    // clamp support was introduced at the same time as 16-bit operations.
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542       .legalFor({S32, S16}) // Clamp modifier
543       .minScalar(0, S16)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 16)
546       .lower();
547 
548     // We're just lowering this, but it helps get a better result to try to
549     // coerce to the desired type first.
550     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551       .minScalar(0, S16)
552       .scalarize(0)
553       .lower();
554   } else {
555     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
556       .legalFor({S32})
557       .clampScalar(0, S32, S32)
558       .scalarize(0);
559 
560     if (ST.hasIntClamp()) {
561       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562         .legalFor({S32}) // Clamp modifier.
563         .scalarize(0)
564         .minScalarOrElt(0, S32)
565         .lower();
566     } else {
567       // Clamp bit support was added in VI, along with 16-bit operations.
568       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569         .minScalar(0, S32)
570         .scalarize(0)
571         .lower();
572     }
573 
574     // FIXME: DAG expansion gets better results. The widening uses the smaller
575     // range values and goes for the min/max lowering directly.
576     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577       .minScalar(0, S32)
578       .scalarize(0)
579       .lower();
580   }
581 
582   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
583     .customFor({S32, S64})
584     .clampScalar(0, S32, S64)
585     .widenScalarToNextPow2(0, 32)
586     .scalarize(0);
587 
588   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
589                    .legalFor({S32})
590                    .maxScalarOrElt(0, S32);
591 
592   if (ST.hasVOP3PInsts()) {
593     Mulh
594       .clampMaxNumElements(0, S8, 2)
595       .lowerFor({V2S8});
596   }
597 
598   Mulh
599     .scalarize(0)
600     .lower();
601 
602   // Report legal for any types we can handle anywhere. For the cases only legal
603   // on the SALU, RegBankSelect will be able to re-legalize.
604   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
605     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
606     .clampScalar(0, S32, S64)
607     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
608     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
609     .widenScalarToNextPow2(0)
610     .scalarize(0);
611 
612   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
613                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614     .legalFor({{S32, S1}, {S32, S32}})
615     .minScalar(0, S32)
616     // TODO: .scalarize(0)
617     .lower();
618 
619   getActionDefinitionsBuilder(G_BITCAST)
620     // Don't worry about the size constraint.
621     .legalIf(all(isRegisterType(0), isRegisterType(1)))
622     .lower();
623 
624 
625   getActionDefinitionsBuilder(G_CONSTANT)
626     .legalFor({S1, S32, S64, S16, GlobalPtr,
627                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
628     .legalIf(isPointer(0))
629     .clampScalar(0, S32, S64)
630     .widenScalarToNextPow2(0);
631 
632   getActionDefinitionsBuilder(G_FCONSTANT)
633     .legalFor({S32, S64, S16})
634     .clampScalar(0, S16, S64);
635 
636   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
637       .legalIf(isRegisterType(0))
638       // s1 and s16 are special cases because they have legal operations on
639       // them, but don't really occupy registers in the normal way.
640       .legalFor({S1, S16})
641       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
642       .clampScalarOrElt(0, S32, MaxScalar)
643       .widenScalarToNextPow2(0, 32)
644       .clampMaxNumElements(0, S32, 16);
645 
646   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
647 
648   // If the amount is divergent, we have to do a wave reduction to get the
649   // maximum value, so this is expanded during RegBankSelect.
650   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
651     .legalFor({{PrivatePtr, S32}});
652 
653   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
654     .customIf(typeIsNot(0, PrivatePtr));
655 
656   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
657 
658   auto &FPOpActions = getActionDefinitionsBuilder(
659     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
660     .legalFor({S32, S64});
661   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
662     .customFor({S32, S64});
663   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
664     .customFor({S32, S64});
665 
666   if (ST.has16BitInsts()) {
667     if (ST.hasVOP3PInsts())
668       FPOpActions.legalFor({S16, V2S16});
669     else
670       FPOpActions.legalFor({S16});
671 
672     TrigActions.customFor({S16});
673     FDIVActions.customFor({S16});
674   }
675 
676   auto &MinNumMaxNum = getActionDefinitionsBuilder({
677       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
678 
679   if (ST.hasVOP3PInsts()) {
680     MinNumMaxNum.customFor(FPTypesPK16)
681       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682       .clampMaxNumElements(0, S16, 2)
683       .clampScalar(0, S16, S64)
684       .scalarize(0);
685   } else if (ST.has16BitInsts()) {
686     MinNumMaxNum.customFor(FPTypes16)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else {
690     MinNumMaxNum.customFor(FPTypesBase)
691       .clampScalar(0, S32, S64)
692       .scalarize(0);
693   }
694 
695   if (ST.hasVOP3PInsts())
696     FPOpActions.clampMaxNumElements(0, S16, 2);
697 
698   FPOpActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   TrigActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   FDIVActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   getActionDefinitionsBuilder({G_FNEG, G_FABS})
711     .legalFor(FPTypesPK16)
712     .clampMaxNumElements(0, S16, 2)
713     .scalarize(0)
714     .clampScalar(0, S16, S64);
715 
716   if (ST.has16BitInsts()) {
717     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
718       .legalFor({S32, S64, S16})
719       .scalarize(0)
720       .clampScalar(0, S16, S64);
721   } else {
722     getActionDefinitionsBuilder(G_FSQRT)
723       .legalFor({S32, S64})
724       .scalarize(0)
725       .clampScalar(0, S32, S64);
726 
727     if (ST.hasFractBug()) {
728       getActionDefinitionsBuilder(G_FFLOOR)
729         .customFor({S64})
730         .legalFor({S32, S64})
731         .scalarize(0)
732         .clampScalar(0, S32, S64);
733     } else {
734       getActionDefinitionsBuilder(G_FFLOOR)
735         .legalFor({S32, S64})
736         .scalarize(0)
737         .clampScalar(0, S32, S64);
738     }
739   }
740 
741   getActionDefinitionsBuilder(G_FPTRUNC)
742     .legalFor({{S32, S64}, {S16, S32}})
743     .scalarize(0)
744     .lower();
745 
746   getActionDefinitionsBuilder(G_FPEXT)
747     .legalFor({{S64, S32}, {S32, S16}})
748     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
749     .scalarize(0);
750 
751   getActionDefinitionsBuilder(G_FSUB)
752       // Use actual fsub instruction
753       .legalFor({S32})
754       // Must use fadd + fneg
755       .lowerFor({S64, S16, V2S16})
756       .scalarize(0)
757       .clampScalar(0, S32, S64);
758 
759   // Whether this is legal depends on the floating point mode for the function.
760   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
761   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
762     FMad.customFor({S32, S16});
763   else if (ST.hasMadMacF32Insts())
764     FMad.customFor({S32});
765   else if (ST.hasMadF16())
766     FMad.customFor({S16});
767   FMad.scalarize(0)
768       .lower();
769 
770   auto &FRem = getActionDefinitionsBuilder(G_FREM);
771   if (ST.has16BitInsts()) {
772     FRem.customFor({S16, S32, S64});
773   } else {
774     FRem.minScalar(0, S32)
775         .customFor({S32, S64});
776   }
777   FRem.scalarize(0);
778 
779   // TODO: Do we need to clamp maximum bitwidth?
780   getActionDefinitionsBuilder(G_TRUNC)
781     .legalIf(isScalar(0))
782     .legalFor({{V2S16, V2S32}})
783     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
787     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
788     .alwaysLegal();
789 
790   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
791     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
792                {S32, S1}, {S64, S1}, {S16, S1}})
793     .scalarize(0)
794     .clampScalar(0, S32, S64)
795     .widenScalarToNextPow2(1, 32);
796 
797   // TODO: Split s1->s64 during regbankselect for VALU.
798   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
799     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
800     .lowerFor({{S32, S64}})
801     .lowerIf(typeIs(1, S1))
802     .customFor({{S64, S64}});
803   if (ST.has16BitInsts())
804     IToFP.legalFor({{S16, S16}});
805   IToFP.clampScalar(1, S32, S64)
806        .minScalar(0, S32)
807        .scalarize(0)
808        .widenScalarToNextPow2(1);
809 
810   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
811     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812     .customFor({{S64, S64}})
813     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
814   if (ST.has16BitInsts())
815     FPToI.legalFor({{S16, S16}});
816   else
817     FPToI.minScalar(1, S32);
818 
819   FPToI.minScalar(0, S32)
820        .scalarize(0)
821        .lower();
822 
823   // Lower roundeven into G_FRINT
824   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
825     .scalarize(0)
826     .lower();
827 
828   if (ST.has16BitInsts()) {
829     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830       .legalFor({S16, S32, S64})
831       .clampScalar(0, S16, S64)
832       .scalarize(0);
833   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
834     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
835       .legalFor({S32, S64})
836       .clampScalar(0, S32, S64)
837       .scalarize(0);
838   } else {
839     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
840       .legalFor({S32})
841       .customFor({S64})
842       .clampScalar(0, S32, S64)
843       .scalarize(0);
844   }
845 
846   getActionDefinitionsBuilder(G_PTR_ADD)
847     .legalIf(all(isPointer(0), sameSize(0, 1)))
848     .scalarize(0)
849     .scalarSameSizeAs(1, 0);
850 
851   getActionDefinitionsBuilder(G_PTRMASK)
852     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
853     .scalarSameSizeAs(1, 0)
854     .scalarize(0);
855 
856   auto &CmpBuilder =
857     getActionDefinitionsBuilder(G_ICMP)
858     // The compare output type differs based on the register bank of the output,
859     // so make both s1 and s32 legal.
860     //
861     // Scalar compares producing output in scc will be promoted to s32, as that
862     // is the allocatable register type that will be needed for the copy from
863     // scc. This will be promoted during RegBankSelect, and we assume something
864     // before that won't try to use s32 result types.
865     //
866     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
867     // bank.
868     .legalForCartesianProduct(
869       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870     .legalForCartesianProduct(
871       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
872   if (ST.has16BitInsts()) {
873     CmpBuilder.legalFor({{S1, S16}});
874   }
875 
876   CmpBuilder
877     .widenScalarToNextPow2(1)
878     .clampScalar(1, S32, S64)
879     .scalarize(0)
880     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
881 
882   getActionDefinitionsBuilder(G_FCMP)
883     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
884     .widenScalarToNextPow2(1)
885     .clampScalar(1, S32, S64)
886     .scalarize(0);
887 
888   // FIXME: fpow has a selection pattern that should move to custom lowering.
889   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
890   if (ST.has16BitInsts())
891     Exp2Ops.legalFor({S32, S16});
892   else
893     Exp2Ops.legalFor({S32});
894   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
895   Exp2Ops.scalarize(0);
896 
897   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
898   if (ST.has16BitInsts())
899     ExpOps.customFor({{S32}, {S16}});
900   else
901     ExpOps.customFor({S32});
902   ExpOps.clampScalar(0, MinScalarFPTy, S32)
903         .scalarize(0);
904 
905   getActionDefinitionsBuilder(G_FPOWI)
906     .clampScalar(0, MinScalarFPTy, S32)
907     .lower();
908 
909   // The 64-bit versions produce 32-bit results, but only on the SALU.
910   getActionDefinitionsBuilder(G_CTPOP)
911     .legalFor({{S32, S32}, {S32, S64}})
912     .clampScalar(0, S32, S32)
913     .clampScalar(1, S32, S64)
914     .scalarize(0)
915     .widenScalarToNextPow2(0, 32)
916     .widenScalarToNextPow2(1, 32);
917 
918   // The hardware instructions return a different result on 0 than the generic
919   // instructions expect. The hardware produces -1, but these produce the
920   // bitwidth.
921   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
922     .scalarize(0)
923     .clampScalar(0, S32, S32)
924     .clampScalar(1, S32, S64)
925     .widenScalarToNextPow2(0, 32)
926     .widenScalarToNextPow2(1, 32)
927     .lower();
928 
929   // The 64-bit versions produce 32-bit results, but only on the SALU.
930   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
931     .legalFor({{S32, S32}, {S32, S64}})
932     .clampScalar(0, S32, S32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32)
936     .widenScalarToNextPow2(1, 32);
937 
938   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
939   // RegBankSelect.
940   getActionDefinitionsBuilder(G_BITREVERSE)
941     .legalFor({S32, S64})
942     .clampScalar(0, S32, S64)
943     .scalarize(0)
944     .widenScalarToNextPow2(0);
945 
946   if (ST.has16BitInsts()) {
947     getActionDefinitionsBuilder(G_BSWAP)
948       .legalFor({S16, S32, V2S16})
949       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
952       .widenScalarToNextPow2(0)
953       .clampScalar(0, S16, S32)
954       .scalarize(0);
955 
956     if (ST.hasVOP3PInsts()) {
957       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
958         .legalFor({S32, S16, V2S16})
959         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
960         .clampMaxNumElements(0, S16, 2)
961         .minScalar(0, S16)
962         .widenScalarToNextPow2(0)
963         .scalarize(0)
964         .lower();
965     } else {
966       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
967         .legalFor({S32, S16})
968         .widenScalarToNextPow2(0)
969         .minScalar(0, S16)
970         .scalarize(0)
971         .lower();
972     }
973   } else {
974     // TODO: Should have same legality without v_perm_b32
975     getActionDefinitionsBuilder(G_BSWAP)
976       .legalFor({S32})
977       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
      // narrowScalar limitation.
980       .widenScalarToNextPow2(0)
981       .maxScalar(0, S32)
982       .scalarize(0)
983       .lower();
984 
985     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
986       .legalFor({S32})
987       .minScalar(0, S32)
988       .widenScalarToNextPow2(0)
989       .scalarize(0)
990       .lower();
991   }
992 
993   getActionDefinitionsBuilder(G_INTTOPTR)
994     // List the common cases
995     .legalForCartesianProduct(AddrSpaces64, {S64})
996     .legalForCartesianProduct(AddrSpaces32, {S32})
997     .scalarize(0)
998     // Accept any address space as long as the size matches
999     .legalIf(sameSize(0, 1))
1000     .widenScalarIf(smallerThan(1, 0),
1001       [](const LegalityQuery &Query) {
1002         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1003       })
1004     .narrowScalarIf(largerThan(1, 0),
1005       [](const LegalityQuery &Query) {
1006         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1007       });
1008 
1009   getActionDefinitionsBuilder(G_PTRTOINT)
1010     // List the common cases
1011     .legalForCartesianProduct(AddrSpaces64, {S64})
1012     .legalForCartesianProduct(AddrSpaces32, {S32})
1013     .scalarize(0)
1014     // Accept any address space as long as the size matches
1015     .legalIf(sameSize(0, 1))
1016     .widenScalarIf(smallerThan(0, 1),
1017       [](const LegalityQuery &Query) {
1018         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1019       })
1020     .narrowScalarIf(
1021       largerThan(0, 1),
1022       [](const LegalityQuery &Query) {
1023         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1024       });
1025 
1026   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1027     .scalarize(0)
1028     .custom();
1029 
1030   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1031                                     bool IsLoad) -> bool {
1032     const LLT DstTy = Query.Types[0];
1033 
1034     // Split vector extloads.
1035     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1036     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1037 
1038     if (MemSize < DstTy.getSizeInBits())
1039       MemSize = std::max(MemSize, AlignBits);
1040 
1041     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1042       return true;
1043 
1044     const LLT PtrTy = Query.Types[1];
1045     unsigned AS = PtrTy.getAddressSpace();
1046     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1047       return true;
1048 
1049     // Catch weird sized loads that don't evenly divide into the access sizes
1050     // TODO: May be able to widen depending on alignment etc.
1051     unsigned NumRegs = (MemSize + 31) / 32;
1052     if (NumRegs == 3) {
1053       if (!ST.hasDwordx3LoadStores())
1054         return true;
1055     } else {
1056       // If the alignment allows, these should have been widened.
1057       if (!isPowerOf2_32(NumRegs))
1058         return true;
1059     }
1060 
1061     if (AlignBits < MemSize) {
1062       const SITargetLowering *TLI = ST.getTargetLowering();
1063       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1064                                                       Align(AlignBits / 8));
1065     }
1066 
1067     return false;
1068   };
1069 
1070   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1071   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1072   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1073 
1074   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1075   // LDS
1076   // TODO: Unsupported flat for SI.
1077 
1078   for (unsigned Op : {G_LOAD, G_STORE}) {
1079     const bool IsStore = Op == G_STORE;
1080 
1081     auto &Actions = getActionDefinitionsBuilder(Op);
1082     // Explicitly list some common cases.
1083     // TODO: Does this help compile time at all?
1084     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1085                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1086                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1087                                       {S64, GlobalPtr, 64, GlobalAlign32},
1088                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1089                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1090                                       {S32, GlobalPtr, 8, GlobalAlign8},
1091                                       {S32, GlobalPtr, 16, GlobalAlign16},
1092 
1093                                       {S32, LocalPtr, 32, 32},
1094                                       {S64, LocalPtr, 64, 32},
1095                                       {V2S32, LocalPtr, 64, 32},
1096                                       {S32, LocalPtr, 8, 8},
1097                                       {S32, LocalPtr, 16, 16},
1098                                       {V2S16, LocalPtr, 32, 32},
1099 
1100                                       {S32, PrivatePtr, 32, 32},
1101                                       {S32, PrivatePtr, 8, 8},
1102                                       {S32, PrivatePtr, 16, 16},
1103                                       {V2S16, PrivatePtr, 32, 32},
1104 
1105                                       {S32, ConstantPtr, 32, GlobalAlign32},
1106                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1107                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1108                                       {S64, ConstantPtr, 64, GlobalAlign32},
1109                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1110     Actions.legalIf(
1111       [=](const LegalityQuery &Query) -> bool {
1112         return isLoadStoreLegal(ST, Query, Op);
1113       });
1114 
    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64 bits.
1117     //
1118     // TODO: Should generalize bitcast action into coerce, which will also cover
1119     // inserting addrspacecasts.
1120     Actions.customIf(typeIs(1, Constant32Ptr));
1121 
1122     // Turn any illegal element vectors into something easier to deal
1123     // with. These will ultimately produce 32-bit scalar shifts to extract the
1124     // parts anyway.
1125     //
1126     // For odd 16-bit element vectors, prefer to split those into pieces with
1127     // 16-bit vector parts.
1128     Actions.bitcastIf(
1129       [=](const LegalityQuery &Query) -> bool {
1130         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1131                                           Query.MMODescrs[0].SizeInBits);
1132       }, bitcastToRegisterType(0));
1133 
1134     if (!IsStore) {
1135       // Widen suitably aligned loads by loading extra bytes. The standard
1136       // legalization actions can't properly express widening memory operands.
1137       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1138         return shouldWidenLoad(ST, Query, G_LOAD);
1139       });
1140     }
1141 
1142     // FIXME: load/store narrowing should be moved to lower action
1143     Actions
1144         .narrowScalarIf(
1145             [=](const LegalityQuery &Query) -> bool {
1146               return !Query.Types[0].isVector() &&
1147                      needToSplitMemOp(Query, Op == G_LOAD);
1148             },
1149             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1150               const LLT DstTy = Query.Types[0];
1151               const LLT PtrTy = Query.Types[1];
1152 
1153               const unsigned DstSize = DstTy.getSizeInBits();
1154               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1155 
1156               // Split extloads.
1157               if (DstSize > MemSize)
1158                 return std::make_pair(0, LLT::scalar(MemSize));
1159 
1160               if (!isPowerOf2_32(DstSize)) {
1161                 // We're probably decomposing an odd sized store. Try to split
1162                 // to the widest type. TODO: Account for alignment. As-is it
1163                 // should be OK, since the new parts will be further legalized.
1164                 unsigned FloorSize = PowerOf2Floor(DstSize);
1165                 return std::make_pair(0, LLT::scalar(FloorSize));
1166               }
1167 
1168               if (DstSize > 32 && (DstSize % 32 != 0)) {
1169                 // FIXME: Need a way to specify non-extload of larger size if
1170                 // suitably aligned.
1171                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1172               }
1173 
1174               unsigned MaxSize = maxSizeForAddrSpace(ST,
1175                                                      PtrTy.getAddressSpace(),
1176                                                      Op == G_LOAD);
1177               if (MemSize > MaxSize)
1178                 return std::make_pair(0, LLT::scalar(MaxSize));
1179 
1180               unsigned Align = Query.MMODescrs[0].AlignInBits;
1181               return std::make_pair(0, LLT::scalar(Align));
1182             })
1183         .fewerElementsIf(
1184             [=](const LegalityQuery &Query) -> bool {
1185               return Query.Types[0].isVector() &&
1186                      needToSplitMemOp(Query, Op == G_LOAD);
1187             },
1188             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1189               const LLT DstTy = Query.Types[0];
1190               const LLT PtrTy = Query.Types[1];
1191 
1192               LLT EltTy = DstTy.getElementType();
1193               unsigned MaxSize = maxSizeForAddrSpace(ST,
1194                                                      PtrTy.getAddressSpace(),
1195                                                      Op == G_LOAD);
1196 
1197               // FIXME: Handle widened to power of 2 results better. This ends
1198               // up scalarizing.
1199               // FIXME: 3 element stores scalarized on SI
1200 
1201               // Split if it's too large for the address space.
1202               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1203                 unsigned NumElts = DstTy.getNumElements();
1204                 unsigned EltSize = EltTy.getSizeInBits();
1205 
1206                 if (MaxSize % EltSize == 0) {
1207                   return std::make_pair(
1208                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1209                 }
1210 
1211                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1212 
1213                 // FIXME: Refine when odd breakdowns handled
1214                 // The scalars will need to be re-legalized.
1215                 if (NumPieces == 1 || NumPieces >= NumElts ||
1216                     NumElts % NumPieces != 0)
1217                   return std::make_pair(0, EltTy);
1218 
1219                 return std::make_pair(0,
1220                                       LLT::vector(NumElts / NumPieces, EltTy));
1221               }
1222 
1223               // FIXME: We could probably handle weird extending loads better.
1224               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1225               if (DstTy.getSizeInBits() > MemSize)
1226                 return std::make_pair(0, EltTy);
1227 
1228               unsigned EltSize = EltTy.getSizeInBits();
1229               unsigned DstSize = DstTy.getSizeInBits();
1230               if (!isPowerOf2_32(DstSize)) {
1231                 // We're probably decomposing an odd sized store. Try to split
1232                 // to the widest type. TODO: Account for alignment. As-is it
1233                 // should be OK, since the new parts will be further legalized.
1234                 unsigned FloorSize = PowerOf2Floor(DstSize);
1235                 return std::make_pair(
1236                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1237               }
1238 
1239               // Need to split because of alignment.
1240               unsigned Align = Query.MMODescrs[0].AlignInBits;
1241               if (EltSize > Align &&
1242                   (EltSize / Align < DstTy.getNumElements())) {
1243                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1244               }
1245 
1246               // May need relegalization for the scalars.
1247               return std::make_pair(0, EltTy);
1248             })
1249     .lowerIfMemSizeNotPow2()
1250     .minScalar(0, S32);
1251 
1252     if (IsStore)
1253       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1254 
1255     Actions
1256         .widenScalarToNextPow2(0)
1257         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1258         .lower();
1259   }
1260 
1261   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1262                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1263                                                   {S32, GlobalPtr, 16, 2 * 8},
1264                                                   {S32, LocalPtr, 8, 8},
1265                                                   {S32, LocalPtr, 16, 16},
1266                                                   {S32, PrivatePtr, 8, 8},
1267                                                   {S32, PrivatePtr, 16, 16},
1268                                                   {S32, ConstantPtr, 8, 8},
1269                                                   {S32, ConstantPtr, 16, 2 * 8}});
1270   if (ST.hasFlatAddressSpace()) {
1271     ExtLoads.legalForTypesWithMemDesc(
1272         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1273   }
1274 
1275   ExtLoads.clampScalar(0, S32, S32)
1276           .widenScalarToNextPow2(0)
1277           .unsupportedIfMemSizeNotPow2()
1278           .lower();
1279 
1280   auto &Atomics = getActionDefinitionsBuilder(
1281     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1282      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1283      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1284      G_ATOMICRMW_UMIN})
1285     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1286                {S64, GlobalPtr}, {S64, LocalPtr},
1287                {S32, RegionPtr}, {S64, RegionPtr}});
1288   if (ST.hasFlatAddressSpace()) {
1289     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1290   }
1291 
1292   if (ST.hasLDSFPAtomics()) {
1293     auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1294       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1295     if (ST.hasGFX90AInsts())
1296       Atomic.legalFor({{S64, LocalPtr}});
1297   }
1298 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1301   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1302     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1303                 {S32, FlatPtr}, {S64, FlatPtr}})
1304     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1305                {S32, RegionPtr}, {S64, RegionPtr}});
1306   // TODO: Pointer types, any 32-bit or 64-bit vector
1307 
1308   // Condition should be s32 for scalar, s1 for vector.
1309   getActionDefinitionsBuilder(G_SELECT)
1310     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1311           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1312           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1313     .clampScalar(0, S16, S64)
1314     .scalarize(1)
1315     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1316     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1317     .clampMaxNumElements(0, S32, 2)
1318     .clampMaxNumElements(0, LocalPtr, 2)
1319     .clampMaxNumElements(0, PrivatePtr, 2)
1320     .scalarize(0)
1321     .widenScalarToNextPow2(0)
1322     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1323 
1324   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1325   // be more flexible with the shift amount type.
1326   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1327     .legalFor({{S32, S32}, {S64, S32}});
1328   if (ST.has16BitInsts()) {
1329     if (ST.hasVOP3PInsts()) {
1330       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1331             .clampMaxNumElements(0, S16, 2);
1332     } else
1333       Shifts.legalFor({{S16, S16}});
1334 
1335     // TODO: Support 16-bit shift amounts for all types
1336     Shifts.widenScalarIf(
1337       [=](const LegalityQuery &Query) {
1338         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1339         // 32-bit amount.
1340         const LLT ValTy = Query.Types[0];
1341         const LLT AmountTy = Query.Types[1];
1342         return ValTy.getSizeInBits() <= 16 &&
1343                AmountTy.getSizeInBits() < 16;
1344       }, changeTo(1, S16));
1345     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1346     Shifts.clampScalar(1, S32, S32);
1347     Shifts.clampScalar(0, S16, S64);
1348     Shifts.widenScalarToNextPow2(0, 16);
1349 
1350     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1351       .minScalar(0, S16)
1352       .scalarize(0)
1353       .lower();
1354   } else {
1355     // Make sure we legalize the shift amount type first, as the general
1356     // expansion for the shifted type will produce much worse code if it hasn't
1357     // been truncated already.
1358     Shifts.clampScalar(1, S32, S32);
1359     Shifts.clampScalar(0, S32, S64);
1360     Shifts.widenScalarToNextPow2(0, 32);
1361 
1362     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1363       .minScalar(0, S32)
1364       .scalarize(0)
1365       .lower();
1366   }
1367   Shifts.scalarize(0);
1368 
1369   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1370     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1371     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1372     unsigned IdxTypeIdx = 2;
1373 
1374     getActionDefinitionsBuilder(Op)
1375       .customIf([=](const LegalityQuery &Query) {
1376           const LLT EltTy = Query.Types[EltTypeIdx];
1377           const LLT VecTy = Query.Types[VecTypeIdx];
1378           const LLT IdxTy = Query.Types[IdxTypeIdx];
1379           const unsigned EltSize = EltTy.getSizeInBits();
1380           return (EltSize == 32 || EltSize == 64) &&
1381                   VecTy.getSizeInBits() % 32 == 0 &&
1382                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1383                   IdxTy.getSizeInBits() == 32;
1384         })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
1387       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1388       .bitcastIf(
1389         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1390         [=](const LegalityQuery &Query) {
1391           // For > 64-bit element types, try to turn this into a 64-bit
1392           // element vector since we may be able to do better indexing
1393           // if this is scalar. If not, fall back to 32.
1394           const LLT EltTy = Query.Types[EltTypeIdx];
1395           const LLT VecTy = Query.Types[VecTypeIdx];
1396           const unsigned DstEltSize = EltTy.getSizeInBits();
1397           const unsigned VecSize = VecTy.getSizeInBits();
1398 
1399           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1400           return std::make_pair(
1401             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1402         })
1403       .clampScalar(EltTypeIdx, S32, S64)
1404       .clampScalar(VecTypeIdx, S32, S64)
1405       .clampScalar(IdxTypeIdx, S32, S32)
1406       .clampMaxNumElements(VecTypeIdx, S32, 32)
1407       // TODO: Clamp elements for 64-bit vectors?
1408       // It should only be necessary with variable indexes.
1409       // As a last resort, lower to the stack
1410       .lower();
1411   }
1412 
1413   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1414     .unsupportedIf([=](const LegalityQuery &Query) {
1415         const LLT &EltTy = Query.Types[1].getElementType();
1416         return Query.Types[0] != EltTy;
1417       });
1418 
1419   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1420     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1421     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1422 
1423     // FIXME: Doesn't handle extract of illegal sizes.
1424     getActionDefinitionsBuilder(Op)
1425       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1426       // FIXME: Multiples of 16 should not be legal.
1427       .legalIf([=](const LegalityQuery &Query) {
1428           const LLT BigTy = Query.Types[BigTyIdx];
1429           const LLT LitTy = Query.Types[LitTyIdx];
1430           return (BigTy.getSizeInBits() % 32 == 0) &&
1431                  (LitTy.getSizeInBits() % 16 == 0);
1432         })
1433       .widenScalarIf(
1434         [=](const LegalityQuery &Query) {
1435           const LLT BigTy = Query.Types[BigTyIdx];
1436           return (BigTy.getScalarSizeInBits() < 16);
1437         },
1438         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1439       .widenScalarIf(
1440         [=](const LegalityQuery &Query) {
1441           const LLT LitTy = Query.Types[LitTyIdx];
1442           return (LitTy.getScalarSizeInBits() < 16);
1443         },
1444         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1445       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1446       .widenScalarToNextPow2(BigTyIdx, 32);
1447 
1448   }
1449 
1450   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1451     .legalForCartesianProduct(AllS32Vectors, {S32})
1452     .legalForCartesianProduct(AllS64Vectors, {S64})
1453     .clampNumElements(0, V16S32, V32S32)
1454     .clampNumElements(0, V2S64, V16S64)
1455     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1456 
1457   if (ST.hasScalarPackInsts()) {
1458     BuildVector
1459       // FIXME: Should probably widen s1 vectors straight to s32
1460       .minScalarOrElt(0, S16)
1461       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1462       .minScalar(1, S32);
1463 
1464     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1465       .legalFor({V2S16, S32})
1466       .lower();
1467     BuildVector.minScalarOrElt(0, S32);
1468   } else {
1469     BuildVector.customFor({V2S16, S16});
1470     BuildVector.minScalarOrElt(0, S32);
1471 
1472     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1473       .customFor({V2S16, S32})
1474       .lower();
1475   }
1476 
1477   BuildVector.legalIf(isRegisterType(0));
1478 
1479   // FIXME: Clamp maximum size
1480   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1481     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1482     .clampMaxNumElements(0, S32, 32)
1483     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1484     .clampMaxNumElements(0, S16, 64);
1485 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1487   // pre-legalize.
1488   if (ST.hasVOP3PInsts()) {
1489     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1490       .customFor({V2S16, V2S16})
1491       .lower();
1492   } else
1493     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1494 
1495   // Merge/Unmerge
1496   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1497     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1498     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1499 
1500     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1501       const LLT Ty = Query.Types[TypeIdx];
1502       if (Ty.isVector()) {
1503         const LLT &EltTy = Ty.getElementType();
1504         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1505           return true;
1506         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1507           return true;
1508       }
1509       return false;
1510     };
1511 
1512     auto &Builder = getActionDefinitionsBuilder(Op)
1513       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1514       .lowerFor({{S16, V2S16}})
1515       .lowerIf([=](const LegalityQuery &Query) {
1516           const LLT BigTy = Query.Types[BigTyIdx];
1517           return BigTy.getSizeInBits() == 32;
1518         })
1519       // Try to widen to s16 first for small types.
1520       // TODO: Only do this on targets with legal s16 shifts
1521       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1522       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1523       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1524       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1525                            elementTypeIs(1, S16)),
1526                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1530       .clampScalar(LitTyIdx, S32, S512)
1531       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1532       // Break up vectors with weird elements into scalars
1533       .fewerElementsIf(
1534         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1535         scalarize(0))
1536       .fewerElementsIf(
1537         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1538         scalarize(1))
1539       .clampScalar(BigTyIdx, S32, MaxScalar);
1540 
1541     if (Op == G_MERGE_VALUES) {
1542       Builder.widenScalarIf(
1543         // TODO: Use 16-bit shifts if legal for 8-bit values?
1544         [=](const LegalityQuery &Query) {
1545           const LLT Ty = Query.Types[LitTyIdx];
1546           return Ty.getSizeInBits() < 32;
1547         },
1548         changeTo(LitTyIdx, S32));
1549     }
1550 
1551     Builder.widenScalarIf(
1552       [=](const LegalityQuery &Query) {
1553         const LLT Ty = Query.Types[BigTyIdx];
1554         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1555           Ty.getSizeInBits() % 16 != 0;
1556       },
1557       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1560         const LLT &Ty = Query.Types[BigTyIdx];
1561         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1562         if (NewSizeInBits >= 256) {
1563           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1564           if (RoundedTo < NewSizeInBits)
1565             NewSizeInBits = RoundedTo;
1566         }
1567         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1568       })
1569       // Any vectors left are the wrong size. Scalarize them.
1570       .scalarize(0)
1571       .scalarize(1);
1572   }
1573 
1574   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1575   // RegBankSelect.
1576   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1577     .legalFor({{S32}, {S64}});
1578 
1579   if (ST.hasVOP3PInsts()) {
1580     SextInReg.lowerFor({{V2S16}})
1581       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1582       // get more vector shift opportunities, since we'll get those when
1583       // expanded.
1584       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1585   } else if (ST.has16BitInsts()) {
1586     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1587   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1590     SextInReg.lowerFor({{S32}, {S64}});
1591   }
1592 
1593   SextInReg
1594     .scalarize(0)
1595     .clampScalar(0, S32, S64)
1596     .lower();
1597 
  // TODO: Only try to form v2s16 with legal packed instructions.
1599   getActionDefinitionsBuilder(G_FSHR)
1600     .legalFor({{S32, S32}})
1601     .lowerFor({{V2S16, V2S16}})
1602     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1603     .scalarize(0)
1604     .lower();
1605 
1606   if (ST.hasVOP3PInsts()) {
1607     getActionDefinitionsBuilder(G_FSHL)
1608       .lowerFor({{V2S16, V2S16}})
1609       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1610       .scalarize(0)
1611       .lower();
1612   } else {
1613     getActionDefinitionsBuilder(G_FSHL)
1614       .scalarize(0)
1615       .lower();
1616   }
1617 
1618   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1619     .legalFor({S64});
1620 
1621   getActionDefinitionsBuilder(G_FENCE)
1622     .alwaysLegal();
1623 
1624   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1625       .scalarize(0)
1626       .minScalar(0, S32)
1627       .lower();
1628 
1629   getActionDefinitionsBuilder({
1630       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1631       G_FCOPYSIGN,
1632 
1633       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1634       G_ATOMICRMW_NAND,
1635       G_ATOMICRMW_FSUB,
1636       G_READ_REGISTER,
1637       G_WRITE_REGISTER,
1638 
1639       G_SADDO, G_SSUBO,
1640 
      // TODO: Implement
1642       G_FMINIMUM, G_FMAXIMUM}).lower();
1643 
1644   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1645         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1646         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1647     .unsupported();
1648 
1649   computeTables();
1650   verify(*ST.getInstrInfo());
1651 }
1652 
1653 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1654                                          MachineInstr &MI) const {
1655   MachineIRBuilder &B = Helper.MIRBuilder;
1656   MachineRegisterInfo &MRI = *B.getMRI();
1657 
1658   switch (MI.getOpcode()) {
1659   case TargetOpcode::G_ADDRSPACE_CAST:
1660     return legalizeAddrSpaceCast(MI, MRI, B);
1661   case TargetOpcode::G_FRINT:
1662     return legalizeFrint(MI, MRI, B);
1663   case TargetOpcode::G_FCEIL:
1664     return legalizeFceil(MI, MRI, B);
1665   case TargetOpcode::G_FREM:
1666     return legalizeFrem(MI, MRI, B);
1667   case TargetOpcode::G_INTRINSIC_TRUNC:
1668     return legalizeIntrinsicTrunc(MI, MRI, B);
1669   case TargetOpcode::G_SITOFP:
1670     return legalizeITOFP(MI, MRI, B, true);
1671   case TargetOpcode::G_UITOFP:
1672     return legalizeITOFP(MI, MRI, B, false);
1673   case TargetOpcode::G_FPTOSI:
1674     return legalizeFPTOI(MI, MRI, B, true);
1675   case TargetOpcode::G_FPTOUI:
1676     return legalizeFPTOI(MI, MRI, B, false);
1677   case TargetOpcode::G_FMINNUM:
1678   case TargetOpcode::G_FMAXNUM:
1679   case TargetOpcode::G_FMINNUM_IEEE:
1680   case TargetOpcode::G_FMAXNUM_IEEE:
1681     return legalizeMinNumMaxNum(Helper, MI);
1682   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1683     return legalizeExtractVectorElt(MI, MRI, B);
1684   case TargetOpcode::G_INSERT_VECTOR_ELT:
1685     return legalizeInsertVectorElt(MI, MRI, B);
1686   case TargetOpcode::G_SHUFFLE_VECTOR:
1687     return legalizeShuffleVector(MI, MRI, B);
1688   case TargetOpcode::G_FSIN:
1689   case TargetOpcode::G_FCOS:
1690     return legalizeSinCos(MI, MRI, B);
1691   case TargetOpcode::G_GLOBAL_VALUE:
1692     return legalizeGlobalValue(MI, MRI, B);
1693   case TargetOpcode::G_LOAD:
1694     return legalizeLoad(Helper, MI);
1695   case TargetOpcode::G_FMAD:
1696     return legalizeFMad(MI, MRI, B);
1697   case TargetOpcode::G_FDIV:
1698     return legalizeFDIV(MI, MRI, B);
1699   case TargetOpcode::G_UDIV:
1700   case TargetOpcode::G_UREM:
1701     return legalizeUDIV_UREM(MI, MRI, B);
1702   case TargetOpcode::G_SDIV:
1703   case TargetOpcode::G_SREM:
1704     return legalizeSDIV_SREM(MI, MRI, B);
1705   case TargetOpcode::G_ATOMIC_CMPXCHG:
1706     return legalizeAtomicCmpXChg(MI, MRI, B);
1707   case TargetOpcode::G_FLOG:
1708     return legalizeFlog(MI, B, numbers::ln2f);
1709   case TargetOpcode::G_FLOG10:
1710     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1711   case TargetOpcode::G_FEXP:
1712     return legalizeFExp(MI, B);
1713   case TargetOpcode::G_FPOW:
1714     return legalizeFPow(MI, B);
1715   case TargetOpcode::G_FFLOOR:
1716     return legalizeFFloor(MI, MRI, B);
1717   case TargetOpcode::G_BUILD_VECTOR:
1718     return legalizeBuildVector(MI, MRI, B);
1719   default:
1720     return false;
1721   }
1722 
1723   llvm_unreachable("expected switch to return");
1724 }
1725 
1726 Register AMDGPULegalizerInfo::getSegmentAperture(
1727   unsigned AS,
1728   MachineRegisterInfo &MRI,
1729   MachineIRBuilder &B) const {
1730   MachineFunction &MF = B.getMF();
1731   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1732   const LLT S32 = LLT::scalar(32);
1733 
1734   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1735 
1736   if (ST.hasApertureRegs()) {
1737     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1738     // getreg.
1739     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1740         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1741         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1742     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1743         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1744         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1745     unsigned Encoding =
1746         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1747         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1748         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1749 
1750     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1751 
1752     B.buildInstr(AMDGPU::S_GETREG_B32)
1753       .addDef(GetReg)
1754       .addImm(Encoding);
1755     MRI.setType(GetReg, S32);
1756 
1757     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
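    // Shift the getreg result left by WidthM1 + 1 (i.e. 16) to reconstruct the
    // full 32-bit aperture base; the hardware register holds only its upper 16
    // bits.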
1758     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1759   }
1760 
1761   Register QueuePtr = MRI.createGenericVirtualRegister(
1762     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1763 
1764   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1765     return Register();
1766 
1767   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1768   // private_segment_aperture_base_hi.
1769   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1770 
1771   // TODO: can we be smarter about machine pointer info?
1772   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1773   MachineMemOperand *MMO = MF.getMachineMemOperand(
1774       PtrInfo,
1775       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1776           MachineMemOperand::MOInvariant,
1777       4, commonAlignment(Align(64), StructOffset));
1778 
1779   Register LoadAddr;
1780 
1781   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1782   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1783 }
1784 
1785 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1786   MachineInstr &MI, MachineRegisterInfo &MRI,
1787   MachineIRBuilder &B) const {
1788   MachineFunction &MF = B.getMF();
1789 
1790   const LLT S32 = LLT::scalar(32);
1791   Register Dst = MI.getOperand(0).getReg();
1792   Register Src = MI.getOperand(1).getReg();
1793 
1794   LLT DstTy = MRI.getType(Dst);
1795   LLT SrcTy = MRI.getType(Src);
1796   unsigned DestAS = DstTy.getAddressSpace();
1797   unsigned SrcAS = SrcTy.getAddressSpace();
1798 
1799   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1800   // vector element.
1801   assert(!DstTy.isVector());
1802 
1803   const AMDGPUTargetMachine &TM
1804     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1805 
1806   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1807     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1808     return true;
1809   }
1810 
1811   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1812     // Truncate.
1813     B.buildExtract(Dst, Src, 0);
1814     MI.eraseFromParent();
1815     return true;
1816   }
1817 
1818   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1819     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1820     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1821 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1825     auto HighAddr = B.buildConstant(
1826       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1827     B.buildMerge(Dst, {Src, HighAddr});
1828     MI.eraseFromParent();
1829     return true;
1830   }
1831 
1832   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1833     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1834            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
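    // A flat pointer is cast to a segment pointer by taking its low 32 bits;
    // a null flat pointer maps to the segment's null value.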
1835     unsigned NullVal = TM.getNullPointerValue(DestAS);
1836 
1837     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1838     auto FlatNull = B.buildConstant(SrcTy, 0);
1839 
1840     // Extract low 32-bits of the pointer.
1841     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1842 
1843     auto CmpRes =
1844         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1845     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1846 
1847     MI.eraseFromParent();
1848     return true;
1849   }
1850 
1851   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1852     return false;
1853 
1854   if (!ST.hasFlatAddressSpace())
1855     return false;
1856 
1857   auto SegmentNull =
1858       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1859   auto FlatNull =
1860       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1861 
1862   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1863   if (!ApertureReg.isValid())
1864     return false;
1865 
1866   auto CmpRes =
1867       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1868 
1869   // Coerce the type of the low half of the result so we can use merge_values.
1870   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1871 
1872   // TODO: Should we allow mismatched types but matching sizes in merges to
1873   // avoid the ptrtoint?
1874   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1875   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1876 
1877   MI.eraseFromParent();
1878   return true;
1879 }
1880 
1881 bool AMDGPULegalizerInfo::legalizeFrint(
1882   MachineInstr &MI, MachineRegisterInfo &MRI,
1883   MachineIRBuilder &B) const {
1884   Register Src = MI.getOperand(1).getReg();
1885   LLT Ty = MRI.getType(Src);
1886   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1887 
1888   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1889   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
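  // C1 = 2^52: adding and then subtracting it (with Src's sign) rounds Src to
  // an integer in double precision. C2 is the largest double below 2^52; any
  // value with a larger magnitude is already an integer and is passed through
  // by the final select.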
1890 
1891   auto C1 = B.buildFConstant(Ty, C1Val);
1892   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1893 
1894   // TODO: Should this propagate fast-math-flags?
1895   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1896   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1897 
1898   auto C2 = B.buildFConstant(Ty, C2Val);
1899   auto Fabs = B.buildFAbs(Ty, Src);
1900 
1901   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1902   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1903   MI.eraseFromParent();
1904   return true;
1905 }
1906 
1907 bool AMDGPULegalizerInfo::legalizeFceil(
1908   MachineInstr &MI, MachineRegisterInfo &MRI,
1909   MachineIRBuilder &B) const {
1910 
1911   const LLT S1 = LLT::scalar(1);
1912   const LLT S64 = LLT::scalar(64);
1913 
1914   Register Src = MI.getOperand(1).getReg();
1915   assert(MRI.getType(Src) == S64);
1916 
1917   // result = trunc(src)
1918   // if (src > 0.0 && src != result)
1919   //   result += 1.0
1920 
1921   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1922 
1923   const auto Zero = B.buildFConstant(S64, 0.0);
1924   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1928   auto Add = B.buildSelect(S64, And, One, Zero);
1929 
1930   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1933 }
1934 
1935 bool AMDGPULegalizerInfo::legalizeFrem(
1936   MachineInstr &MI, MachineRegisterInfo &MRI,
1937   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1950 }
1951 
1952 static MachineInstrBuilder extractF64Exponent(Register Hi,
1953                                               MachineIRBuilder &B) {
1954   const unsigned FractBits = 52;
1955   const unsigned ExpBits = 11;
1956   LLT S32 = LLT::scalar(32);
1957 
1958   auto Const0 = B.buildConstant(S32, FractBits - 32);
1959   auto Const1 = B.buildConstant(S32, ExpBits);
1960 
1961   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1962     .addUse(Hi)
1963     .addUse(Const0.getReg(0))
1964     .addUse(Const1.getReg(0));
1965 
1966   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1967 }
1968 
1969 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1970   MachineInstr &MI, MachineRegisterInfo &MRI,
1971   MachineIRBuilder &B) const {
1972   const LLT S1 = LLT::scalar(1);
1973   const LLT S32 = LLT::scalar(32);
1974   const LLT S64 = LLT::scalar(64);
1975 
1976   Register Src = MI.getOperand(1).getReg();
1977   assert(MRI.getType(Src) == S64);
1978 
1979   // TODO: Should this use extract since the low half is unused?
1980   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1981   Register Hi = Unmerge.getReg(1);
1982 
1983   // Extract the upper half, since this is where we will find the sign and
1984   // exponent.
1985   auto Exp = extractF64Exponent(Hi, B);
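  // Truncate by clearing the fraction bits that fall below the binary point
  // for this exponent. A negative exponent means |Src| < 1, so the result is
  // a signed zero; an exponent > 51 means Src is already an integer.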
1986 
1987   const unsigned FractBits = 52;
1988 
1989   // Extract the sign bit.
1990   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1991   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1992 
1993   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1994 
1995   const auto Zero32 = B.buildConstant(S32, 0);
1996 
1997   // Extend back to 64-bits.
1998   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1999 
2000   auto Shr = B.buildAShr(S64, FractMask, Exp);
2001   auto Not = B.buildNot(S64, Shr);
2002   auto Tmp0 = B.buildAnd(S64, Src, Not);
2003   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2004 
2005   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2006   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2007 
2008   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2009   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2010   MI.eraseFromParent();
2011   return true;
2012 }
2013 
2014 bool AMDGPULegalizerInfo::legalizeITOFP(
2015   MachineInstr &MI, MachineRegisterInfo &MRI,
2016   MachineIRBuilder &B, bool Signed) const {
2017 
2018   Register Dst = MI.getOperand(0).getReg();
2019   Register Src = MI.getOperand(1).getReg();
2020 
2021   const LLT S64 = LLT::scalar(64);
2022   const LLT S32 = LLT::scalar(32);
2023 
2024   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
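  // Convert the two 32-bit halves separately and recombine:
  //   result = ldexp(fp(Hi), 32) + fp(Lo)
  // Only the high half carries the sign; the low half is always converted as
  // unsigned.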
2025 
2026   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2027 
2028   auto CvtHi = Signed ?
2029     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2030     B.buildUITOFP(S64, Unmerge.getReg(1));
2031 
2032   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2033 
2034   auto ThirtyTwo = B.buildConstant(S32, 32);
2035   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2036     .addUse(CvtHi.getReg(0))
2037     .addUse(ThirtyTwo.getReg(0));
2038 
2039   // TODO: Should this propagate fast-math-flags?
2040   B.buildFAdd(Dst, LdExp, CvtLo);
2041   MI.eraseFromParent();
2042   return true;
2043 }
2044 
2045 // TODO: Copied from DAG implementation. Verify logic and document how this
2046 // actually works.
2047 bool AMDGPULegalizerInfo::legalizeFPTOI(
2048   MachineInstr &MI, MachineRegisterInfo &MRI,
2049   MachineIRBuilder &B, bool Signed) const {
2050 
2051   Register Dst = MI.getOperand(0).getReg();
2052   Register Src = MI.getOperand(1).getReg();
2053 
2054   const LLT S64 = LLT::scalar(64);
2055   const LLT S32 = LLT::scalar(32);
2056 
2057   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2058 
2059   unsigned Flags = MI.getFlags();
2060 
2061   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2062   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2063   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
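  // K0 = 2^-32 and K1 = -2^32 as f64 bit patterns. The high half is
  // floor(trunc(Src) * 2^-32); the FMA then reconstructs the low half as
  // trunc(Src) - Hi * 2^32.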
2064 
2065   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2066   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2067   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2068 
2069   auto Hi = Signed ?
2070     B.buildFPTOSI(S32, FloorMul) :
2071     B.buildFPTOUI(S32, FloorMul);
2072   auto Lo = B.buildFPTOUI(S32, Fma);
2073 
2074   B.buildMerge(Dst, { Lo, Hi });
2075   MI.eraseFromParent();
2076 
2077   return true;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2081                                                MachineInstr &MI) const {
2082   MachineFunction &MF = Helper.MIRBuilder.getMF();
2083   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2084 
2085   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2086                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2087 
2088   // With ieee_mode disabled, the instructions have the correct behavior
2089   // already for G_FMINNUM/G_FMAXNUM
2090   if (!MFI->getMode().IEEE)
2091     return !IsIEEEOp;
2092 
2093   if (IsIEEEOp)
2094     return true;
2095 
2096   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2097 }
2098 
2099 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2100   MachineInstr &MI, MachineRegisterInfo &MRI,
2101   MachineIRBuilder &B) const {
2102   // TODO: Should move some of this into LegalizerHelper.
2103 
2104   // TODO: Promote dynamic indexing of s16 to s32
2105 
2106   // FIXME: Artifact combiner probably should have replaced the truncated
2107   // constant before this, so we shouldn't need
2108   // getConstantVRegValWithLookThrough.
2109   Optional<ValueAndVReg> MaybeIdxVal =
2110       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2111   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2112     return true;
2113   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2114 
2115   Register Dst = MI.getOperand(0).getReg();
2116   Register Vec = MI.getOperand(1).getReg();
2117 
2118   LLT VecTy = MRI.getType(Vec);
2119   LLT EltTy = VecTy.getElementType();
2120   assert(EltTy == MRI.getType(Dst));
2121 
2122   if (IdxVal < VecTy.getNumElements())
2123     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2124   else
2125     B.buildUndef(Dst);
2126 
2127   MI.eraseFromParent();
2128   return true;
2129 }
2130 
2131 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2132   MachineInstr &MI, MachineRegisterInfo &MRI,
2133   MachineIRBuilder &B) const {
2134   // TODO: Should move some of this into LegalizerHelper.
2135 
2136   // TODO: Promote dynamic indexing of s16 to s32
2137 
2138   // FIXME: Artifact combiner probably should have replaced the truncated
2139   // constant before this, so we shouldn't need
2140   // getConstantVRegValWithLookThrough.
2141   Optional<ValueAndVReg> MaybeIdxVal =
2142       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2143   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2144     return true;
2145 
2146   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2147   Register Dst = MI.getOperand(0).getReg();
2148   Register Vec = MI.getOperand(1).getReg();
2149   Register Ins = MI.getOperand(2).getReg();
2150 
2151   LLT VecTy = MRI.getType(Vec);
2152   LLT EltTy = VecTy.getElementType();
2153   assert(EltTy == MRI.getType(Ins));
2154 
2155   if (IdxVal < VecTy.getNumElements())
2156     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2157   else
2158     B.buildUndef(Dst);
2159 
2160   MI.eraseFromParent();
2161   return true;
2162 }
2163 
2164 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2165   MachineInstr &MI, MachineRegisterInfo &MRI,
2166   MachineIRBuilder &B) const {
2167   const LLT V2S16 = LLT::vector(2, 16);
2168 
2169   Register Dst = MI.getOperand(0).getReg();
2170   Register Src0 = MI.getOperand(1).getReg();
2171   LLT DstTy = MRI.getType(Dst);
2172   LLT SrcTy = MRI.getType(Src0);
2173 
2174   if (SrcTy == V2S16 && DstTy == V2S16 &&
2175       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2176     return true;
2177 
2178   MachineIRBuilder HelperBuilder(MI);
2179   GISelObserverWrapper DummyObserver;
2180   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2181   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2182 }
2183 
2184 bool AMDGPULegalizerInfo::legalizeSinCos(
2185   MachineInstr &MI, MachineRegisterInfo &MRI,
2186   MachineIRBuilder &B) const {
2187 
2188   Register DstReg = MI.getOperand(0).getReg();
2189   Register SrcReg = MI.getOperand(1).getReg();
2190   LLT Ty = MRI.getType(DstReg);
2191   unsigned Flags = MI.getFlags();
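  // The hardware sin/cos instructions take an input scaled by 1/(2*pi);
  // subtargets with a reduced valid input range additionally wrap the scaled
  // value into [0, 1) with amdgcn.fract.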
2192 
2193   Register TrigVal;
2194   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2195   if (ST.hasTrigReducedRange()) {
2196     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2197     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2198       .addUse(MulVal.getReg(0))
2199       .setMIFlags(Flags).getReg(0);
2200   } else
2201     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2202 
2203   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2204     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2205   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2206     .addUse(TrigVal)
2207     .setMIFlags(Flags);
2208   MI.eraseFromParent();
2209   return true;
2210 }
2211 
2212 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2213                                                   MachineIRBuilder &B,
2214                                                   const GlobalValue *GV,
2215                                                   int64_t Offset,
2216                                                   unsigned GAFlags) const {
2217   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2218   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2219   // to the following code sequence:
2220   //
2221   // For constant address space:
2222   //   s_getpc_b64 s[0:1]
2223   //   s_add_u32 s0, s0, $symbol
2224   //   s_addc_u32 s1, s1, 0
2225   //
2226   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2227   //   a fixup or relocation is emitted to replace $symbol with a literal
2228   //   constant, which is a pc-relative offset from the encoding of the $symbol
2229   //   operand to the global variable.
2230   //
2231   // For global address space:
2232   //   s_getpc_b64 s[0:1]
2233   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2234   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2235   //
2236   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2237   //   fixups or relocations are emitted to replace $symbol@*@lo and
2238   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2239   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2240   //   operand to the global variable.
2241   //
2242   // What we want here is an offset from the value returned by s_getpc
2243   // (which is the address of the s_add_u32 instruction) to the global
2244   // variable, but since the encoding of $symbol starts 4 bytes after the start
2245   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2246   // small. This requires us to add 4 to the global variable offset in order to
2247   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2248   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2249   // instruction.
2250 
2251   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2252 
2253   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2254     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2255 
2256   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2257     .addDef(PCReg);
2258 
2259   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2260   if (GAFlags == SIInstrInfo::MO_NONE)
2261     MIB.addImm(0);
2262   else
2263     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2264 
2265   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2266 
2267   if (PtrTy.getSizeInBits() == 32)
2268     B.buildExtract(DstReg, PCReg, 0);
2269   return true;
2270  }
2271 
2272 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2273   MachineInstr &MI, MachineRegisterInfo &MRI,
2274   MachineIRBuilder &B) const {
2275   Register DstReg = MI.getOperand(0).getReg();
2276   LLT Ty = MRI.getType(DstReg);
2277   unsigned AS = Ty.getAddressSpace();
2278 
2279   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2280   MachineFunction &MF = B.getMF();
2281   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2282 
2283   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2284     if (!MFI->isModuleEntryFunction()) {
2285       const Function &Fn = MF.getFunction();
2286       DiagnosticInfoUnsupported BadLDSDecl(
2287         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2288         DS_Warning);
2289       Fn.getContext().diagnose(BadLDSDecl);
2290 
2291       // We currently don't have a way to correctly allocate LDS objects that
2292       // aren't directly associated with a kernel. We do force inlining of
2293       // functions that use local objects. However, if these dead functions are
2294       // not eliminated, we don't want a compile time error. Just emit a warning
2295       // and a trap, since there should be no callable path here.
2296       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2297       B.buildUndef(DstReg);
2298       MI.eraseFromParent();
2299       return true;
2300     }
2301 
2302     // TODO: We could emit code to handle the initialization somewhere.
2303     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2304       const SITargetLowering *TLI = ST.getTargetLowering();
2305       if (!TLI->shouldUseLDSConstAddress(GV)) {
2306         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2308       }
2309 
2310       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2311         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. Such arrays are allocated
        // by the runtime and placed directly after the statically allocated
        // ones; they all share the same offset.
2317         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2318           // Adjust alignment for that dynamic shared memory array.
2319           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2320           LLT S32 = LLT::scalar(32);
2321           auto Sz =
2322               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2323           B.buildIntToPtr(DstReg, Sz);
2324           MI.eraseFromParent();
2325           return true;
2326         }
2327       }
2328 
2329       B.buildConstant(
2330           DstReg,
2331           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2332       MI.eraseFromParent();
2333       return true;
2334     }
2335 
2336     const Function &Fn = MF.getFunction();
2337     DiagnosticInfoUnsupported BadInit(
2338       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2339     Fn.getContext().diagnose(BadInit);
2340     return true;
2341   }
2342 
2343   const SITargetLowering *TLI = ST.getTargetLowering();
2344 
2345   if (TLI->shouldEmitFixup(GV)) {
2346     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2347     MI.eraseFromParent();
2348     return true;
2349   }
2350 
2351   if (TLI->shouldEmitPCReloc(GV)) {
2352     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2353     MI.eraseFromParent();
2354     return true;
2355   }
2356 
2357   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2358   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2359 
2360   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2361       MachinePointerInfo::getGOT(MF),
2362       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2363           MachineMemOperand::MOInvariant,
2364       8 /*Size*/, Align(8));
2365 
2366   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2367 
2368   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2370     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2371     B.buildExtract(DstReg, Load, 0);
2372   } else
2373     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2374 
2375   MI.eraseFromParent();
2376   return true;
2377 }
2378 
2379 static LLT widenToNextPowerOf2(LLT Ty) {
2380   if (Ty.isVector())
2381     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2382   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2383 }
2384 
2385 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2386                                        MachineInstr &MI) const {
2387   MachineIRBuilder &B = Helper.MIRBuilder;
2388   MachineRegisterInfo &MRI = *B.getMRI();
2389   GISelChangeObserver &Observer = Helper.Observer;
2390 
2391   Register PtrReg = MI.getOperand(1).getReg();
2392   LLT PtrTy = MRI.getType(PtrReg);
2393   unsigned AddrSpace = PtrTy.getAddressSpace();
2394 
2395   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2396     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2397     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2398     Observer.changingInstr(MI);
2399     MI.getOperand(1).setReg(Cast.getReg(0));
2400     Observer.changedInstr(MI);
2401     return true;
2402   }
2403 
2404   Register ValReg = MI.getOperand(0).getReg();
2405   LLT ValTy = MRI.getType(ValReg);
2406 
2407   MachineMemOperand *MMO = *MI.memoperands_begin();
2408   const unsigned ValSize = ValTy.getSizeInBits();
2409   const unsigned MemSize = 8 * MMO->getSize();
2410   const Align MemAlign = MMO->getAlign();
2411   const unsigned AlignInBits = 8 * MemAlign.value();
2412 
2413   // Widen non-power-of-2 loads to the alignment if needed
2414   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2415     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2416 
2417     // This was already the correct extending load result type, so just adjust
2418     // the memory type.
2419     if (WideMemSize == ValSize) {
2420       MachineFunction &MF = B.getMF();
2421 
2422       MachineMemOperand *WideMMO =
2423           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2424       Observer.changingInstr(MI);
2425       MI.setMemRefs(MF, {WideMMO});
2426       Observer.changedInstr(MI);
2427       return true;
2428     }
2429 
    // Don't bother handling an edge case that should probably never be
    // produced.
2431     if (ValSize > WideMemSize)
2432       return false;
2433 
2434     LLT WideTy = widenToNextPowerOf2(ValTy);
2435 
2436     Register WideLoad;
2437     if (!WideTy.isVector()) {
2438       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2439       B.buildTrunc(ValReg, WideLoad).getReg(0);
2440     } else {
2441       // Extract the subvector.
2442 
2443       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2445         // (e.g. <3 x s32> -> <4 x s32>)
2446         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2447         B.buildExtract(ValReg, WideLoad, 0);
2448       } else {
2449         // For cases where the widened type isn't a nice register value, unmerge
2450         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2451         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2452         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2453         B.setInsertPt(B.getMBB(), MI.getIterator());
2454         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2455       }
2456     }
2457 
2458     MI.eraseFromParent();
2459     return true;
2460   }
2461 
2462   return false;
2463 }
2464 
2465 bool AMDGPULegalizerInfo::legalizeFMad(
2466   MachineInstr &MI, MachineRegisterInfo &MRI,
2467   MachineIRBuilder &B) const {
2468   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2469   assert(Ty.isScalar());
2470 
2471   MachineFunction &MF = B.getMF();
2472   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2473 
2474   // TODO: Always legal with future ftz flag.
2475   // FIXME: Do we need just output?
2476   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2477     return true;
2478   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2479     return true;
2480 
2481   MachineIRBuilder HelperBuilder(MI);
2482   GISelObserverWrapper DummyObserver;
2483   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2484   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2485 }
2486 
2487 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2488   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2489   Register DstReg = MI.getOperand(0).getReg();
2490   Register PtrReg = MI.getOperand(1).getReg();
2491   Register CmpVal = MI.getOperand(2).getReg();
2492   Register NewVal = MI.getOperand(3).getReg();
2493 
2494   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2495          "this should not have been custom lowered");
2496 
2497   LLT ValTy = MRI.getType(CmpVal);
2498   LLT VecTy = LLT::vector(2, ValTy);
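  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
  // into a single vector operand, with the new value in element 0.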
2499 
2500   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2501 
2502   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2503     .addDef(DstReg)
2504     .addUse(PtrReg)
2505     .addUse(PackedVal)
2506     .setMemRefs(MI.memoperands());
2507 
2508   MI.eraseFromParent();
2509   return true;
2510 }
2511 
2512 bool AMDGPULegalizerInfo::legalizeFlog(
2513   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2514   Register Dst = MI.getOperand(0).getReg();
2515   Register Src = MI.getOperand(1).getReg();
2516   LLT Ty = B.getMRI()->getType(Dst);
2517   unsigned Flags = MI.getFlags();
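  // log_b(x) = log2(x) * (1 / log2(b)). Callers pass 1 / log2(b) as
  // Log2BaseInverted: ln(2) for G_FLOG and ln(2) / ln(10) for G_FLOG10.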
2518 
2519   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2520   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2521 
2522   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2523   MI.eraseFromParent();
2524   return true;
2525 }
2526 
2527 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2528                                        MachineIRBuilder &B) const {
2529   Register Dst = MI.getOperand(0).getReg();
2530   Register Src = MI.getOperand(1).getReg();
2531   unsigned Flags = MI.getFlags();
2532   LLT Ty = B.getMRI()->getType(Dst);
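  // exp(x) = exp2(x * log2(e)).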
2533 
2534   auto K = B.buildFConstant(Ty, numbers::log2e);
2535   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2536   B.buildFExp2(Dst, Mul, Flags);
2537   MI.eraseFromParent();
2538   return true;
2539 }
2540 
2541 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2542                                        MachineIRBuilder &B) const {
2543   Register Dst = MI.getOperand(0).getReg();
2544   Register Src0 = MI.getOperand(1).getReg();
2545   Register Src1 = MI.getOperand(2).getReg();
2546   unsigned Flags = MI.getFlags();
2547   LLT Ty = B.getMRI()->getType(Dst);
2548   const LLT S16 = LLT::scalar(16);
2549   const LLT S32 = LLT::scalar(32);
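  // Expand pow(x, y) as exp2(log2(x) * y). The multiply uses fmul_legacy,
  // which returns 0 for 0 * anything (including inf and nan).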
2550 
2551   if (Ty == S32) {
2552     auto Log = B.buildFLog2(S32, Src0, Flags);
2553     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2554       .addUse(Log.getReg(0))
2555       .addUse(Src1)
2556       .setMIFlags(Flags);
2557     B.buildFExp2(Dst, Mul, Flags);
2558   } else if (Ty == S16) {
2559     // There's no f16 fmul_legacy, so we need to convert for it.
2560     auto Log = B.buildFLog2(S16, Src0, Flags);
2561     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2562     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2563     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2564       .addUse(Ext0.getReg(0))
2565       .addUse(Ext1.getReg(0))
2566       .setMIFlags(Flags);
2567 
2568     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2569   } else
2570     return false;
2571 
2572   MI.eraseFromParent();
2573   return true;
2574 }
2575 
2576 // Find a source register, ignoring any possible source modifiers.
2577 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2578   Register ModSrc = OrigSrc;
2579   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2580     ModSrc = SrcFNeg->getOperand(1).getReg();
2581     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2582       ModSrc = SrcFAbs->getOperand(1).getReg();
2583   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2584     ModSrc = SrcFAbs->getOperand(1).getReg();
2585   return ModSrc;
2586 }
2587 
2588 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2589                                          MachineRegisterInfo &MRI,
2590                                          MachineIRBuilder &B) const {
2591 
2592   const LLT S1 = LLT::scalar(1);
2593   const LLT S64 = LLT::scalar(64);
2594   Register Dst = MI.getOperand(0).getReg();
2595   Register OrigSrc = MI.getOperand(1).getReg();
2596   unsigned Flags = MI.getFlags();
2597   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2598          "this should not have been custom lowered");
2599 
2600   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2601   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2602   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2603   // V_FRACT bug is:
2604   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2605   //
2606   // Convert floor(x) to (x - fract(x))
2607 
2608   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2609     .addUse(OrigSrc)
2610     .setMIFlags(Flags);
2611 
2612   // Give source modifier matching some assistance before obscuring a foldable
2613   // pattern.
2614 
2615   // TODO: We can avoid the neg on the fract? The input sign to fract
2616   // shouldn't matter?
2617   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2618 
2619   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
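  // 0x3fefffffffffffff is the largest double strictly less than 1.0; it is
  // used to clamp the buggy V_FRACT result as described above.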
2620 
2621   Register Min = MRI.createGenericVirtualRegister(S64);
2622 
2623   // We don't need to concern ourselves with the snan handling difference, so
2624   // use the one which will directly select.
2625   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2626   if (MFI->getMode().IEEE)
2627     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2628   else
2629     B.buildFMinNum(Min, Fract, Const, Flags);
2630 
2631   Register CorrectedFract = Min;
2632   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2633     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2634     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2635   }
2636 
2637   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2638   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2639 
2640   MI.eraseFromParent();
2641   return true;
2642 }
2643 
2644 // Turn an illegal packed v2s16 build vector into bit operations.
2645 // TODO: This should probably be a bitcast action in LegalizerHelper.
2646 bool AMDGPULegalizerInfo::legalizeBuildVector(
2647   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2648   Register Dst = MI.getOperand(0).getReg();
2649   const LLT S32 = LLT::scalar(32);
2650   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2651 
2652   Register Src0 = MI.getOperand(1).getReg();
2653   Register Src1 = MI.getOperand(2).getReg();
2654   assert(MRI.getType(Src0) == LLT::scalar(16));
2655 
2656   auto Merge = B.buildMerge(S32, {Src0, Src1});
2657   B.buildBitcast(Dst, Merge);
2658 
2659   MI.eraseFromParent();
2660   return true;
2661 }
2662 
2663 // Check that this is a G_XOR x, -1
2664 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2665   if (MI.getOpcode() != TargetOpcode::G_XOR)
2666     return false;
2667   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2668   return ConstVal && *ConstVal == -1;
2669 }
2670 
2671 // Return the use branch instruction, otherwise null if the usage is invalid.
2672 static MachineInstr *
2673 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2674                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2675   Register CondDef = MI.getOperand(0).getReg();
2676   if (!MRI.hasOneNonDBGUse(CondDef))
2677     return nullptr;
2678 
2679   MachineBasicBlock *Parent = MI.getParent();
2680   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2681 
2682   if (isNot(MRI, *UseMI)) {
2683     Register NegatedCond = UseMI->getOperand(0).getReg();
2684     if (!MRI.hasOneNonDBGUse(NegatedCond))
2685       return nullptr;
2686 
2687     // We're deleting the def of this value, so we need to remove it.
2688     UseMI->eraseFromParent();
2689 
2690     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2691     Negated = true;
2692   }
2693 
2694   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2695     return nullptr;
2696 
2697   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2698   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2699   if (Next == Parent->end()) {
2700     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2701     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2702       return nullptr;
2703     UncondBrTarget = &*NextMBB;
2704   } else {
2705     if (Next->getOpcode() != AMDGPU::G_BR)
2706       return nullptr;
2707     Br = &*Next;
2708     UncondBrTarget = Br->getOperand(0).getMBB();
2709   }
2710 
2711   return UseMI;
2712 }
2713 
2714 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2715                                          const ArgDescriptor *Arg,
2716                                          const TargetRegisterClass *ArgRC,
2717                                          LLT ArgTy) const {
2718   MCRegister SrcReg = Arg->getRegister();
2719   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2720   assert(DstReg.isVirtual() && "Virtual register expected");
2721 
2722   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2723                                              ArgTy);
2724   if (Arg->isMasked()) {
2725     // TODO: Should we try to emit this once in the entry block?
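    // Masked arguments are bitfields packed into a single input register
    // (e.g. the packed work-item IDs): shift the field down and mask off the
    // rest.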
2726     const LLT S32 = LLT::scalar(32);
2727     const unsigned Mask = Arg->getMask();
2728     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2729 
2730     Register AndMaskSrc = LiveIn;
2731 
2732     if (Shift != 0) {
2733       auto ShiftAmt = B.buildConstant(S32, Shift);
2734       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2735     }
2736 
2737     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2738   } else {
2739     B.buildCopy(DstReg, LiveIn);
2740   }
2741 
2742   return true;
2743 }
2744 
2745 bool AMDGPULegalizerInfo::loadInputValue(
2746     Register DstReg, MachineIRBuilder &B,
2747     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2748   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2749   const ArgDescriptor *Arg;
2750   const TargetRegisterClass *ArgRC;
2751   LLT ArgTy;
2752   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2753 
2754   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2755     return false; // TODO: Handle these
2756   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2757 }
2758 
2759 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2760     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2761     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2762   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2763     return false;
2764 
2765   MI.eraseFromParent();
2766   return true;
2767 }
2768 
2769 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2770                                        MachineRegisterInfo &MRI,
2771                                        MachineIRBuilder &B) const {
2772   Register Dst = MI.getOperand(0).getReg();
2773   LLT DstTy = MRI.getType(Dst);
2774   LLT S16 = LLT::scalar(16);
2775   LLT S32 = LLT::scalar(32);
2776   LLT S64 = LLT::scalar(64);
2777 
2778   if (DstTy == S16)
2779     return legalizeFDIV16(MI, MRI, B);
2780   if (DstTy == S32)
2781     return legalizeFDIV32(MI, MRI, B);
2782   if (DstTy == S64)
2783     return legalizeFDIV64(MI, MRI, B);
2784 
2785   return false;
2786 }
2787 
2788 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2789                                                   Register DstReg,
2790                                                   Register X,
2791                                                   Register Y,
2792                                                   bool IsDiv) const {
2793   const LLT S1 = LLT::scalar(1);
2794   const LLT S32 = LLT::scalar(32);
2795 
2796   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2797   // algorithm used here.
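  //
  // In short: Z ~= 2**32 / Y is computed with V_RCP_IFLAG_F32 and refined
  // with one Newton-Raphson step, the quotient estimate is Q = umulh(X, Z),
  // and Q (or the remainder R = X - Q * Y) is then corrected with up to two
  // conditional adjustments.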
2798 
2799   // Initial estimate of inv(y).
2800   auto FloatY = B.buildUITOFP(S32, Y);
2801   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2802   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2803   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2804   auto Z = B.buildFPTOUI(S32, ScaledY);
2805 
  // One round of UNR (unsigned Newton-Raphson refinement).
2807   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2808   auto NegYZ = B.buildMul(S32, NegY, Z);
2809   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2810 
2811   // Quotient/remainder estimate.
2812   auto Q = B.buildUMulH(S32, X, Z);
2813   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2814 
2815   // First quotient/remainder refinement.
2816   auto One = B.buildConstant(S32, 1);
2817   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2818   if (IsDiv)
2819     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2820   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2821 
2822   // Second quotient/remainder refinement.
2823   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2824   if (IsDiv)
2825     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2826   else
2827     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2828 }
2829 
2830 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2831                                               MachineRegisterInfo &MRI,
2832                                               MachineIRBuilder &B) const {
2833   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2834   Register DstReg = MI.getOperand(0).getReg();
2835   Register Num = MI.getOperand(1).getReg();
2836   Register Den = MI.getOperand(2).getReg();
2837   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2838   MI.eraseFromParent();
2839   return true;
2840 }
2841 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2843 //
2844 // Return lo, hi of result
2845 //
2846 // %cvt.lo = G_UITOFP Val.lo
2847 // %cvt.hi = G_UITOFP Val.hi
2848 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2849 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2850 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2851 // %mul2 = G_FMUL %mul1, 2**(-32)
2852 // %trunc = G_INTRINSIC_TRUNC %mul2
2853 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2854 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
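//
// The {lo, hi} pair is a fixed-point estimate of 2**64 / Val, which the
// 64-bit division expansion refines with Newton-Raphson steps.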
2855 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2856                                                        Register Val) {
2857   const LLT S32 = LLT::scalar(32);
2858   auto Unmerge = B.buildUnmerge(S32, Val);
2859 
2860   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2861   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2862 
2863   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2864                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2865 
2866   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2867   auto Mul1 =
2868       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2869 
2870   // 2**(-32)
2871   auto Mul2 =
2872       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2873   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2874 
2875   // -(2**32)
2876   auto Mad2 = B.buildFMAD(S32, Trunc,
2877                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2878 
2879   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2880   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2881 
2882   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2883 }
2884 
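     // Expand 64-bit unsigned division/remainder. The fixed-point reciprocal
     // estimate produced by emitReciprocalU64 is refined with two
     // Newton-Raphson rounds carried out on 32-bit halves with explicit carry
     // chains; the quotient estimate umulh(Numer, Rcp) is then corrected by up
     // to two conditional adjustments (the "if C3" / "if C6" blocks below).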
2885 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2886                                                   Register DstReg,
2887                                                   Register Numer,
2888                                                   Register Denom,
2889                                                   bool IsDiv) const {
2890   const LLT S32 = LLT::scalar(32);
2891   const LLT S64 = LLT::scalar(64);
2892   const LLT S1 = LLT::scalar(1);
2893   Register RcpLo, RcpHi;
2894 
2895   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2896 
2897   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2898 
2899   auto Zero64 = B.buildConstant(S64, 0);
2900   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2901 
2902   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2903   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2904 
2905   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2906   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2907   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2908 
2909   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2910   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2911   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2912   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2913 
2914   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2915   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2916   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2917   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2918   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2919 
2920   auto Zero32 = B.buildConstant(S32, 0);
2921   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2922   auto Add2_HiC =
2923       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2924   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2925   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2926 
2927   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2928   Register NumerLo = UnmergeNumer.getReg(0);
2929   Register NumerHi = UnmergeNumer.getReg(1);
2930 
2931   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2932   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2933   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2934   Register Mul3_Lo = UnmergeMul3.getReg(0);
2935   Register Mul3_Hi = UnmergeMul3.getReg(1);
2936   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2937   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2938   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2939   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2940 
2941   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2942   Register DenomLo = UnmergeDenom.getReg(0);
2943   Register DenomHi = UnmergeDenom.getReg(1);
2944 
2945   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2946   auto C1 = B.buildSExt(S32, CmpHi);
2947 
2948   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2949   auto C2 = B.buildSExt(S32, CmpLo);
2950 
2951   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2952   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2953 
2954   // TODO: Here and below, portions of the code can be enclosed in if/endif.
2955   // Currently the control flow is unconditional and we have 4 selects after the
2956   // potential endif to substitute PHIs.
2957 
2958   // if C3 != 0 ...
2959   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2960   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2961   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2962   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2963 
2964   auto One64 = B.buildConstant(S64, 1);
2965   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2966 
2967   auto C4 =
2968       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2969   auto C5 =
2970       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2971   auto C6 = B.buildSelect(
2972       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2973 
2974   // if (C6 != 0)
2975   auto Add4 = B.buildAdd(S64, Add3, One64);
2976   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2977 
2978   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2979   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2980   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2981 
2982   // endif C6
2983   // endif C3
2984 
2985   if (IsDiv) {
2986     auto Sel1 = B.buildSelect(
2987         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2988     B.buildSelect(DstReg,
2989                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2990   } else {
2991     auto Sel2 = B.buildSelect(
2992         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2993     B.buildSelect(DstReg,
2994                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2995   }
2996 }
2997 
2998 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2999                                             MachineRegisterInfo &MRI,
3000                                             MachineIRBuilder &B) const {
3001   const LLT S64 = LLT::scalar(64);
3002   const LLT S32 = LLT::scalar(32);
3003   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
3004   Register DstReg = MI.getOperand(0).getReg();
3005   Register Num = MI.getOperand(1).getReg();
3006   Register Den = MI.getOperand(2).getReg();
3007   LLT Ty = MRI.getType(DstReg);
3008 
3009   if (Ty == S32)
3010     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3011   else if (Ty == S64)
3012     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3013   else
3014     return false;
3015 
3016   MI.eraseFromParent();
3017   return true;
3019 }
3020 
3021 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3022                                             MachineRegisterInfo &MRI,
3023                                             MachineIRBuilder &B) const {
3024   const LLT S64 = LLT::scalar(64);
3025   const LLT S32 = LLT::scalar(32);
3026 
3027   Register DstReg = MI.getOperand(0).getReg();
3028   const LLT Ty = MRI.getType(DstReg);
3029   if (Ty != S32 && Ty != S64)
3030     return false;
3031 
3032   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3033 
3034   Register LHS = MI.getOperand(1).getReg();
3035   Register RHS = MI.getOperand(2).getReg();
3036 
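       // Take absolute values using |x| = (x + sign) ^ sign, where sign is x
       // shifted arithmetically by bitwidth - 1. After the unsigned expansion,
       // the sign is restored below by xor + sub: the quotient is negative when
       // the operand signs differ, and the remainder takes the sign of the LHS.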
3037   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3038   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3039   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3040 
3041   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3042   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3043 
3044   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3045   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3046 
3047   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3048   if (Ty == S32)
3049     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3050   else
3051     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3052 
3053   Register Sign;
3054   if (IsDiv)
3055     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3056   else
3057     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3058 
3059   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3060   B.buildSub(DstReg, UDivRem, Sign);
3061 
3062   MI.eraseFromParent();
3063   return true;
3064 }
3065 
3066 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3067                                                  MachineRegisterInfo &MRI,
3068                                                  MachineIRBuilder &B) const {
3069   Register Res = MI.getOperand(0).getReg();
3070   Register LHS = MI.getOperand(1).getReg();
3071   Register RHS = MI.getOperand(2).getReg();
3072   uint16_t Flags = MI.getFlags();
3073   LLT ResTy = MRI.getType(Res);
3074 
3075   const MachineFunction &MF = B.getMF();
3076   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3077                             MI.getFlag(MachineInstr::FmAfn);
3078 
3079   if (!AllowInaccurateRcp)
3080     return false;
3081 
3082   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3083     // 1 / x -> RCP(x)
3084     if (CLHS->isExactlyValue(1.0)) {
3085       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3086         .addUse(RHS)
3087         .setMIFlags(Flags);
3088 
3089       MI.eraseFromParent();
3090       return true;
3091     }
3092 
3093     // -1 / x -> RCP( FNEG(x) )
3094     if (CLHS->isExactlyValue(-1.0)) {
3095       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3096       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3097         .addUse(FNeg.getReg(0))
3098         .setMIFlags(Flags);
3099 
3100       MI.eraseFromParent();
3101       return true;
3102     }
3103   }
3104 
3105   // x / y -> x * (1.0 / y)
3106   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3107     .addUse(RHS)
3108     .setMIFlags(Flags);
3109   B.buildFMul(Res, LHS, RCP, Flags);
3110 
3111   MI.eraseFromParent();
3112   return true;
3113 }
3114 
3115 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3116                                                    MachineRegisterInfo &MRI,
3117                                                    MachineIRBuilder &B) const {
3118   Register Res = MI.getOperand(0).getReg();
3119   Register X = MI.getOperand(1).getReg();
3120   Register Y = MI.getOperand(2).getReg();
3121   uint16_t Flags = MI.getFlags();
3122   LLT ResTy = MRI.getType(Res);
3123 
3124   const MachineFunction &MF = B.getMF();
3125   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3126                             MI.getFlag(MachineInstr::FmAfn);
3127 
3128   if (!AllowInaccurateRcp)
3129     return false;
3130 
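       // Refine the hardware reciprocal of Y with two Newton-Raphson steps of
       // the form r = r + r * (1 - y * r), multiply by X, then apply one final
       // fma on the residual to tighten the result.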
3131   auto NegY = B.buildFNeg(ResTy, Y);
3132   auto One = B.buildFConstant(ResTy, 1.0);
3133 
3134   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3135     .addUse(Y)
3136     .setMIFlags(Flags);
3137 
3138   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3139   R = B.buildFMA(ResTy, Tmp0, R, R);
3140 
3141   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3142   R = B.buildFMA(ResTy, Tmp1, R, R);
3143 
3144   auto Ret = B.buildFMul(ResTy, X, R);
3145   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3146 
3147   B.buildFMA(Res, Tmp2, R, Ret);
3148   MI.eraseFromParent();
3149   return true;
3150 }
3151 
3152 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3153                                          MachineRegisterInfo &MRI,
3154                                          MachineIRBuilder &B) const {
3155   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3156     return true;
3157 
3158   Register Res = MI.getOperand(0).getReg();
3159   Register LHS = MI.getOperand(1).getReg();
3160   Register RHS = MI.getOperand(2).getReg();
3161 
3162   uint16_t Flags = MI.getFlags();
3163 
3164   LLT S16 = LLT::scalar(16);
3165   LLT S32 = LLT::scalar(32);
3166 
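       // Expand f16 fdiv through f32: extend both operands, multiply the
       // extended numerator by the f32 reciprocal of the denominator, truncate
       // back to f16, and let div_fixup patch up the special cases.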
3167   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3168   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3169 
3170   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3171     .addUse(RHSExt.getReg(0))
3172     .setMIFlags(Flags);
3173 
3174   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3175   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3176 
3177   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3178     .addUse(RDst.getReg(0))
3179     .addUse(RHS)
3180     .addUse(LHS)
3181     .setMIFlags(Flags);
3182 
3183   MI.eraseFromParent();
3184   return true;
3185 }
3186 
3187 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3188 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3189 static void toggleSPDenormMode(bool Enable,
3190                                MachineIRBuilder &B,
3191                                const GCNSubtarget &ST,
3192                                AMDGPU::SIModeRegisterDefaults Mode) {
3193   // Set SP denorm mode to this value.
3194   unsigned SPDenormMode =
3195     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3196 
3197   if (ST.hasDenormModeInst()) {
3198     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3199     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3200 
3201     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3202     B.buildInstr(AMDGPU::S_DENORM_MODE)
3203       .addImm(NewDenormModeValue);
3204 
3205   } else {
3206     // Select FP32 bit field in mode register.
3207     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3208                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3209                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
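         // This encodes hwreg(HW_REG_MODE, 4, 2): offset 4, width 2
         // (WIDTH_M1 == 1), i.e. the FP32 denorm bits of the MODE register.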
3210 
3211     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3212       .addImm(SPDenormMode)
3213       .addImm(SPDenormModeBitField);
3214   }
3215 }
3216 
3217 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3218                                          MachineRegisterInfo &MRI,
3219                                          MachineIRBuilder &B) const {
3220   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3221     return true;
3222 
3223   Register Res = MI.getOperand(0).getReg();
3224   Register LHS = MI.getOperand(1).getReg();
3225   Register RHS = MI.getOperand(2).getReg();
3226   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3227   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3228 
3229   uint16_t Flags = MI.getFlags();
3230 
3231   LLT S32 = LLT::scalar(32);
3232   LLT S1 = LLT::scalar(1);
3233 
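       // Standard f32 division expansion: scale the operands with div_scale,
       // refine the scaled reciprocal with a chain of fmas, combine the pieces
       // with div_fmas (which also takes the scale flag produced by div_scale),
       // and undo the scaling / handle special cases with div_fixup.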
3234   auto One = B.buildFConstant(S32, 1.0f);
3235 
3236   auto DenominatorScaled =
3237     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3238       .addUse(LHS)
3239       .addUse(RHS)
3240       .addImm(0)
3241       .setMIFlags(Flags);
3242   auto NumeratorScaled =
3243     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3244       .addUse(LHS)
3245       .addUse(RHS)
3246       .addImm(1)
3247       .setMIFlags(Flags);
3248 
3249   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3250     .addUse(DenominatorScaled.getReg(0))
3251     .setMIFlags(Flags);
3252   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3253 
3254   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3255   // aren't modeled as reading it.
3256   if (!Mode.allFP32Denormals())
3257     toggleSPDenormMode(true, B, ST, Mode);
3258 
3259   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3260   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3261   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3262   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3263   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3264   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3265 
3266   if (!Mode.allFP32Denormals())
3267     toggleSPDenormMode(false, B, ST, Mode);
3268 
3269   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3270     .addUse(Fma4.getReg(0))
3271     .addUse(Fma1.getReg(0))
3272     .addUse(Fma3.getReg(0))
3273     .addUse(NumeratorScaled.getReg(1))
3274     .setMIFlags(Flags);
3275 
3276   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3277     .addUse(Fmas.getReg(0))
3278     .addUse(RHS)
3279     .addUse(LHS)
3280     .setMIFlags(Flags);
3281 
3282   MI.eraseFromParent();
3283   return true;
3284 }
3285 
3286 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3287                                          MachineRegisterInfo &MRI,
3288                                          MachineIRBuilder &B) const {
3289   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3290     return true;
3291 
3292   Register Res = MI.getOperand(0).getReg();
3293   Register LHS = MI.getOperand(1).getReg();
3294   Register RHS = MI.getOperand(2).getReg();
3295 
3296   uint16_t Flags = MI.getFlags();
3297 
3298   LLT S64 = LLT::scalar(64);
3299   LLT S1 = LLT::scalar(1);
3300 
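       // The f64 expansion follows the same div_scale / fma refinement /
       // div_fmas / div_fixup pattern as the f32 case, with an extra fma
       // iteration and no denorm mode toggling.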
3301   auto One = B.buildFConstant(S64, 1.0);
3302 
3303   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3304     .addUse(LHS)
3305     .addUse(RHS)
3306     .addImm(0)
3307     .setMIFlags(Flags);
3308 
3309   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3310 
3311   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3312     .addUse(DivScale0.getReg(0))
3313     .setMIFlags(Flags);
3314 
3315   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3316   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3317   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3318 
3319   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3320     .addUse(LHS)
3321     .addUse(RHS)
3322     .addImm(1)
3323     .setMIFlags(Flags);
3324 
3325   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3326   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3327   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3328 
3329   Register Scale;
3330   if (!ST.hasUsableDivScaleConditionOutput()) {
3331     // Workaround a hardware bug on SI where the condition output from div_scale
3332     // is not usable.
3333 
3334     LLT S32 = LLT::scalar(32);
3335 
3336     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3337     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3338     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3339     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3340 
3341     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3342                               Scale1Unmerge.getReg(1));
3343     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3344                               Scale0Unmerge.getReg(1));
3345     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3346   } else {
3347     Scale = DivScale1.getReg(1);
3348   }
3349 
3350   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3351     .addUse(Fma4.getReg(0))
3352     .addUse(Fma3.getReg(0))
3353     .addUse(Mul.getReg(0))
3354     .addUse(Scale)
3355     .setMIFlags(Flags);
3356 
3357   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3358     .addUse(Fmas.getReg(0))
3359     .addUse(RHS)
3360     .addUse(LHS)
3361     .setMIFlags(Flags);
3362 
3363   MI.eraseFromParent();
3364   return true;
3365 }
3366 
3367 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3368                                                  MachineRegisterInfo &MRI,
3369                                                  MachineIRBuilder &B) const {
3370   Register Res = MI.getOperand(0).getReg();
3371   Register LHS = MI.getOperand(2).getReg();
3372   Register RHS = MI.getOperand(3).getReg();
3373   uint16_t Flags = MI.getFlags();
3374 
3375   LLT S32 = LLT::scalar(32);
3376   LLT S1 = LLT::scalar(1);
3377 
3378   auto Abs = B.buildFAbs(S32, RHS, Flags);
3379   const APFloat C0Val(1.0f);
3380 
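       // C0 is 2**96 and C1 is 2**-32 (as IEEE-754 single bit patterns). If
       // |RHS| is very large, pre-scale it by 2**-32 before the reciprocal so
       // the intermediate does not land in the denormal range, then fold the
       // same scale factor back into the final product.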
3381   auto C0 = B.buildConstant(S32, 0x6f800000);
3382   auto C1 = B.buildConstant(S32, 0x2f800000);
3383   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3384 
3385   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3386   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3387 
3388   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3389 
3390   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3391     .addUse(Mul0.getReg(0))
3392     .setMIFlags(Flags);
3393 
3394   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3395 
3396   B.buildFMul(Res, Sel, Mul1, Flags);
3397 
3398   MI.eraseFromParent();
3399   return true;
3400 }
3401 
3402 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3403 // FIXME: Why do we handle this one but not other removed instructions?
3404 //
3405 // Reciprocal square root.  The clamp prevents infinite results, clamping
3406 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3407 // +-max_float.
3408 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3409                                                     MachineRegisterInfo &MRI,
3410                                                     MachineIRBuilder &B) const {
3411   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3412     return true;
3413 
3414   Register Dst = MI.getOperand(0).getReg();
3415   Register Src = MI.getOperand(2).getReg();
3416   auto Flags = MI.getFlags();
3417 
3418   LLT Ty = MRI.getType(Dst);
3419 
3420   const fltSemantics *FltSemantics;
3421   if (Ty == LLT::scalar(32))
3422     FltSemantics = &APFloat::IEEEsingle();
3423   else if (Ty == LLT::scalar(64))
3424     FltSemantics = &APFloat::IEEEdouble();
3425   else
3426     return false;
3427 
3428   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3429     .addUse(Src)
3430     .setMIFlags(Flags);
3431 
3432   // We don't need to concern ourselves with the snan handling difference: the
3433   // rsq already quieted (or not) any snan, so use the variant that selects directly.
3434   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3435   const bool UseIEEE = MFI->getMode().IEEE;
3436 
3437   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3438   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3439                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3440 
3441   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3442 
3443   if (UseIEEE)
3444     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3445   else
3446     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3447   MI.eraseFromParent();
3448   return true;
3449 }
3450 
3451 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3452   switch (IID) {
3453   case Intrinsic::amdgcn_ds_fadd:
3454     return AMDGPU::G_ATOMICRMW_FADD;
3455   case Intrinsic::amdgcn_ds_fmin:
3456     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3457   case Intrinsic::amdgcn_ds_fmax:
3458     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3459   default:
3460     llvm_unreachable("not a DS FP intrinsic");
3461   }
3462 }
3463 
3464 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3465                                                       MachineInstr &MI,
3466                                                       Intrinsic::ID IID) const {
3467   GISelChangeObserver &Observer = Helper.Observer;
3468   Observer.changingInstr(MI);
3469 
3470   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3471 
3472   // The remaining operands were used to set fields in the MemOperand on
3473   // construction.
3474   for (int I = 6; I > 3; --I)
3475     MI.RemoveOperand(I);
3476 
3477   MI.RemoveOperand(1); // Remove the intrinsic ID.
3478   Observer.changedInstr(MI);
3479   return true;
3480 }
3481 
3482 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3483                                             MachineRegisterInfo &MRI,
3484                                             MachineIRBuilder &B) const {
3485   uint64_t Offset =
3486     ST.getTargetLowering()->getImplicitParameterOffset(
3487       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3488   LLT DstTy = MRI.getType(DstReg);
3489   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3490 
3491   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3492   if (!loadInputValue(KernargPtrReg, B,
3493                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3494     return false;
3495 
3496   // FIXME: This should be nuw
3497   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3498   return true;
3499 }
3500 
3501 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3502                                                  MachineRegisterInfo &MRI,
3503                                                  MachineIRBuilder &B) const {
3504   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3505   if (!MFI->isEntryFunction()) {
3506     return legalizePreloadedArgIntrin(MI, MRI, B,
3507                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3508   }
3509 
3510   Register DstReg = MI.getOperand(0).getReg();
3511   if (!getImplicitArgPtr(DstReg, MRI, B))
3512     return false;
3513 
3514   MI.eraseFromParent();
3515   return true;
3516 }
3517 
3518 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3519                                               MachineRegisterInfo &MRI,
3520                                               MachineIRBuilder &B,
3521                                               unsigned AddrSpace) const {
3522   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3523   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3524   Register Hi32 = Unmerge.getReg(1);
3525 
3526   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3527   MI.eraseFromParent();
3528   return true;
3529 }
3530 
3531 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3532 // offset (the offset that is included in bounds checking and swizzling, to be
3533 // split between the instruction's voffset and immoffset fields) and soffset
3534 // (the offset that is excluded from bounds checking and swizzling, to go in
3535 // the instruction's soffset field).  This function takes the first kind of
3536 // offset and figures out how to split it between voffset and immoffset.
3537 std::tuple<Register, unsigned, unsigned>
3538 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3539                                         Register OrigOffset) const {
3540   const unsigned MaxImm = 4095;
3541   Register BaseReg;
3542   unsigned TotalConstOffset;
3543   const LLT S32 = LLT::scalar(32);
3544   MachineRegisterInfo &MRI = *B.getMRI();
3545 
3546   std::tie(BaseReg, TotalConstOffset) =
3547       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3548 
3549   unsigned ImmOffset = TotalConstOffset;
3550 
3551   // If BaseReg is a pointer, convert it to int.
3552   if (MRI.getType(BaseReg).isPointer())
3553     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3554 
3555   // If the immediate value is too big for the immoffset field, put the value
3556   // and -4096 into the immoffset field so that the value that is copied/added
3557   // for the voffset field is a multiple of 4096, and it stands more chance
3558   // of being CSEd with the copy/add for another similar load/store.
3559   // However, do not do that rounding down to a multiple of 4096 if that is a
3560   // negative number, as it appears to be illegal to have a negative offset
3561   // in the vgpr, even if adding the immediate offset makes it positive.
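       // For example, a total constant offset of 5000 becomes Overflow = 4096
       // (added to the voffset register) and ImmOffset = 904.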
3562   unsigned Overflow = ImmOffset & ~MaxImm;
3563   ImmOffset -= Overflow;
3564   if ((int32_t)Overflow < 0) {
3565     Overflow += ImmOffset;
3566     ImmOffset = 0;
3567   }
3568 
3569   if (Overflow != 0) {
3570     if (!BaseReg) {
3571       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3572     } else {
3573       auto OverflowVal = B.buildConstant(S32, Overflow);
3574       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3575     }
3576   }
3577 
3578   if (!BaseReg)
3579     BaseReg = B.buildConstant(S32, 0).getReg(0);
3580 
3581   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3582 }
3583 
3584 /// Handle register layout difference for f16 images for some subtargets.
3585 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3586                                              MachineRegisterInfo &MRI,
3587                                              Register Reg,
3588                                              bool ImageStore) const {
3589   const LLT S16 = LLT::scalar(16);
3590   const LLT S32 = LLT::scalar(32);
3591   LLT StoreVT = MRI.getType(Reg);
3592   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3593 
3594   if (ST.hasUnpackedD16VMem()) {
3595     auto Unmerge = B.buildUnmerge(S16, Reg);
3596 
3597     SmallVector<Register, 4> WideRegs;
3598     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3599       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3600 
3601     int NumElts = StoreVT.getNumElements();
3602 
3603     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3604   }
3605 
3606   if (ImageStore && ST.hasImageStoreD16Bug()) {
3607     if (StoreVT.getNumElements() == 2) {
3608       SmallVector<Register, 4> PackedRegs;
3609       Reg = B.buildBitcast(S32, Reg).getReg(0);
3610       PackedRegs.push_back(Reg);
3611       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3612       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3613     }
3614 
3615     if (StoreVT.getNumElements() == 3) {
3616       SmallVector<Register, 4> PackedRegs;
3617       auto Unmerge = B.buildUnmerge(S16, Reg);
3618       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3619         PackedRegs.push_back(Unmerge.getReg(I));
3620       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3621       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3622       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3623     }
3624 
3625     if (StoreVT.getNumElements() == 4) {
3626       SmallVector<Register, 4> PackedRegs;
3627       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3628       auto Unmerge = B.buildUnmerge(S32, Reg);
3629       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3630         PackedRegs.push_back(Unmerge.getReg(I));
3631       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3632       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3633     }
3634 
3635     llvm_unreachable("invalid data type");
3636   }
3637 
3638   return Reg;
3639 }
3640 
3641 Register AMDGPULegalizerInfo::fixStoreSourceType(
3642   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3643   MachineRegisterInfo *MRI = B.getMRI();
3644   LLT Ty = MRI->getType(VData);
3645 
3646   const LLT S16 = LLT::scalar(16);
3647 
3648   // Fixup illegal register types for i8 stores.
3649   if (Ty == LLT::scalar(8) || Ty == S16) {
3650     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3651     return AnyExt;
3652   }
3653 
3654   if (Ty.isVector()) {
3655     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3656       if (IsFormat)
3657         return handleD16VData(B, *MRI, VData);
3658     }
3659   }
3660 
3661   return VData;
3662 }
3663 
3664 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3665                                               MachineRegisterInfo &MRI,
3666                                               MachineIRBuilder &B,
3667                                               bool IsTyped,
3668                                               bool IsFormat) const {
3669   Register VData = MI.getOperand(1).getReg();
3670   LLT Ty = MRI.getType(VData);
3671   LLT EltTy = Ty.getScalarType();
3672   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3673   const LLT S32 = LLT::scalar(32);
3674 
3675   VData = fixStoreSourceType(B, VData, IsFormat);
3676   Register RSrc = MI.getOperand(2).getReg();
3677 
3678   MachineMemOperand *MMO = *MI.memoperands_begin();
3679   const int MemSize = MMO->getSize();
3680 
3681   unsigned ImmOffset;
3682   unsigned TotalOffset;
3683 
3684   // The typed intrinsics add an immediate after the registers.
3685   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3686 
3687   // The struct intrinsic variants add one additional operand over raw.
3688   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3689   Register VIndex;
3690   int OpOffset = 0;
3691   if (HasVIndex) {
3692     VIndex = MI.getOperand(3).getReg();
3693     OpOffset = 1;
3694   }
3695 
3696   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3697   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3698 
3699   unsigned Format = 0;
3700   if (IsTyped) {
3701     Format = MI.getOperand(5 + OpOffset).getImm();
3702     ++OpOffset;
3703   }
3704 
3705   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3706 
3707   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3708   if (TotalOffset != 0)
3709     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3710 
3711   unsigned Opc;
3712   if (IsTyped) {
3713     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3714                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3715   } else if (IsFormat) {
3716     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3717                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3718   } else {
3719     switch (MemSize) {
3720     case 1:
3721       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3722       break;
3723     case 2:
3724       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3725       break;
3726     default:
3727       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3728       break;
3729     }
3730   }
3731 
3732   if (!VIndex)
3733     VIndex = B.buildConstant(S32, 0).getReg(0);
3734 
3735   auto MIB = B.buildInstr(Opc)
3736     .addUse(VData)              // vdata
3737     .addUse(RSrc)               // rsrc
3738     .addUse(VIndex)             // vindex
3739     .addUse(VOffset)            // voffset
3740     .addUse(SOffset)            // soffset
3741     .addImm(ImmOffset);         // offset(imm)
3742 
3743   if (IsTyped)
3744     MIB.addImm(Format);
3745 
3746   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3747      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3748      .addMemOperand(MMO);
3749 
3750   MI.eraseFromParent();
3751   return true;
3752 }
3753 
3754 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3755                                              MachineRegisterInfo &MRI,
3756                                              MachineIRBuilder &B,
3757                                              bool IsFormat,
3758                                              bool IsTyped) const {
3759   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3760   MachineMemOperand *MMO = *MI.memoperands_begin();
3761   const int MemSize = MMO->getSize();
3762   const LLT S32 = LLT::scalar(32);
3763 
3764   Register Dst = MI.getOperand(0).getReg();
3765   Register RSrc = MI.getOperand(2).getReg();
3766 
3767   // The typed intrinsics add an immediate after the registers.
3768   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3769 
3770   // The struct intrinsic variants add one additional operand over raw.
3771   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3772   Register VIndex;
3773   int OpOffset = 0;
3774   if (HasVIndex) {
3775     VIndex = MI.getOperand(3).getReg();
3776     OpOffset = 1;
3777   }
3778 
3779   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3780   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3781 
3782   unsigned Format = 0;
3783   if (IsTyped) {
3784     Format = MI.getOperand(5 + OpOffset).getImm();
3785     ++OpOffset;
3786   }
3787 
3788   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3789   unsigned ImmOffset;
3790   unsigned TotalOffset;
3791 
3792   LLT Ty = MRI.getType(Dst);
3793   LLT EltTy = Ty.getScalarType();
3794   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3795   const bool Unpacked = ST.hasUnpackedD16VMem();
3796 
3797   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3798   if (TotalOffset != 0)
3799     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3800 
3801   unsigned Opc;
3802 
3803   if (IsTyped) {
3804     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3805                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3806   } else if (IsFormat) {
3807     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3808                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3809   } else {
3810     switch (MemSize) {
3811     case 1:
3812       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3813       break;
3814     case 2:
3815       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3816       break;
3817     default:
3818       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3819       break;
3820     }
3821   }
3822 
3823   Register LoadDstReg;
3824 
3825   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3826   LLT UnpackedTy = Ty.changeElementSize(32);
3827 
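       // Sub-dword and scalar d16 results are loaded into a wider 32-bit
       // register and truncated afterwards; packed d16 vectors on unpacked
       // subtargets are loaded as 32-bit elements and repacked below.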
3828   if (IsExtLoad)
3829     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3830   else if (Unpacked && IsD16 && Ty.isVector())
3831     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3832   else
3833     LoadDstReg = Dst;
3834 
3835   if (!VIndex)
3836     VIndex = B.buildConstant(S32, 0).getReg(0);
3837 
3838   auto MIB = B.buildInstr(Opc)
3839     .addDef(LoadDstReg)         // vdata
3840     .addUse(RSrc)               // rsrc
3841     .addUse(VIndex)             // vindex
3842     .addUse(VOffset)            // voffset
3843     .addUse(SOffset)            // soffset
3844     .addImm(ImmOffset);         // offset(imm)
3845 
3846   if (IsTyped)
3847     MIB.addImm(Format);
3848 
3849   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3850      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3851      .addMemOperand(MMO);
3852 
3853   if (LoadDstReg != Dst) {
3854     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3855 
3856     // The result register was widened for the extending load; truncate it back.
3857     if (IsExtLoad)
3858       B.buildTrunc(Dst, LoadDstReg);
3859     else {
3860       // Repack to original 16-bit vector result
3861       // FIXME: G_TRUNC should work, but legalization currently fails
3862       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3863       SmallVector<Register, 4> Repack;
3864       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3865         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3866       B.buildMerge(Dst, Repack);
3867     }
3868   }
3869 
3870   MI.eraseFromParent();
3871   return true;
3872 }
3873 
3874 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3875                                                MachineIRBuilder &B,
3876                                                bool IsInc) const {
3877   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3878                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3879   B.buildInstr(Opc)
3880     .addDef(MI.getOperand(0).getReg())
3881     .addUse(MI.getOperand(2).getReg())
3882     .addUse(MI.getOperand(3).getReg())
3883     .cloneMemRefs(MI);
3884   MI.eraseFromParent();
3885   return true;
3886 }
3887 
3888 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3889   switch (IntrID) {
3890   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3891   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3892     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3893   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3894   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3895     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3896   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3897   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3898     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3899   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3900   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3901     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3902   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3903   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3904     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3905   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3906   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3907     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3908   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3909   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3910     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3911   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3912   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3913     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3914   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3915   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3916     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3917   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3918   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3919     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3920   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3921   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3922     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3923   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3924   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3925     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3926   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3927   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3928     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3929   case Intrinsic::amdgcn_buffer_atomic_fadd:
3930   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3931   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3932     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3933   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3934   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3935     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3936   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3937   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3938     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3939   default:
3940     llvm_unreachable("unhandled atomic opcode");
3941   }
3942 }
3943 
3944 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3945                                                MachineIRBuilder &B,
3946                                                Intrinsic::ID IID) const {
3947   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3948                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3949   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3950 
3951   Register Dst;
3952 
3953   int OpOffset = 0;
3954   if (HasReturn) {
3955     // A few FP atomics do not support return values.
3956     Dst = MI.getOperand(0).getReg();
3957   } else {
3958     OpOffset = -1;
3959   }
3960 
3961   Register VData = MI.getOperand(2 + OpOffset).getReg();
3962   Register CmpVal;
3963 
3964   if (IsCmpSwap) {
3965     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3966     ++OpOffset;
3967   }
3968 
3969   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3970   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3971 
3972   // The struct intrinsic variants add one additional operand over raw.
3973   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3974   Register VIndex;
3975   if (HasVIndex) {
3976     VIndex = MI.getOperand(4 + OpOffset).getReg();
3977     ++OpOffset;
3978   }
3979 
3980   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3981   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3982   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3983 
3984   MachineMemOperand *MMO = *MI.memoperands_begin();
3985 
3986   unsigned ImmOffset;
3987   unsigned TotalOffset;
3988   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3989   if (TotalOffset != 0)
3990     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3991 
3992   if (!VIndex)
3993     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3994 
3995   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3996 
3997   if (HasReturn)
3998     MIB.addDef(Dst);
3999 
4000   MIB.addUse(VData); // vdata
4001 
4002   if (IsCmpSwap)
4003     MIB.addReg(CmpVal);
4004 
4005   MIB.addUse(RSrc)               // rsrc
4006      .addUse(VIndex)             // vindex
4007      .addUse(VOffset)            // voffset
4008      .addUse(SOffset)            // soffset
4009      .addImm(ImmOffset)          // offset(imm)
4010      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4011      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4012      .addMemOperand(MMO);
4013 
4014   MI.eraseFromParent();
4015   return true;
4016 }
4017 
4018 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
4019 /// vector with s16 typed elements.
4020 static void packImageA16AddressToDwords(
4021     MachineIRBuilder &B, MachineInstr &MI,
4022     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4023     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4024   const LLT S16 = LLT::scalar(16);
4025   const LLT V2S16 = LLT::vector(2, 16);
4026 
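       // Operands before the gradients are already dword sized and are simply
       // bitcast to <2 x s16>; gradient and coordinate components are packed
       // two at a time into <2 x s16>, with undef filling any leftover lane.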
4027   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4028     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4029     if (!SrcOp.isReg())
4030       continue; // _L to _LZ may have eliminated this.
4031 
4032     Register AddrReg = SrcOp.getReg();
4033 
4034     if (I < Intr->GradientStart) {
4035       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4036       PackedAddrs.push_back(AddrReg);
4037     } else {
4038       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4039       // derivatives dx/dh and dx/dv are packed with undef.
4040       if (((I + 1) >= EndIdx) ||
4041           ((Intr->NumGradients / 2) % 2 == 1 &&
4042            (I == static_cast<unsigned>(Intr->GradientStart +
4043                                        (Intr->NumGradients / 2) - 1) ||
4044             I == static_cast<unsigned>(Intr->GradientStart +
4045                                        Intr->NumGradients - 1))) ||
4046           // Check for _L to _LZ optimization
4047           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4048         PackedAddrs.push_back(
4049             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4050                 .getReg(0));
4051       } else {
4052         PackedAddrs.push_back(
4053             B.buildBuildVector(
4054                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4055                 .getReg(0));
4056         ++I;
4057       }
4058     }
4059   }
4060 }
4061 
4062 /// Convert from separate vaddr components to a single vector address register,
4063 /// and replace the remaining operands with $noreg.
4064 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4065                                      int DimIdx, int NumVAddrs) {
4066   const LLT S32 = LLT::scalar(32);
4067 
4068   SmallVector<Register, 8> AddrRegs;
4069   for (int I = 0; I != NumVAddrs; ++I) {
4070     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4071     if (SrcOp.isReg()) {
4072       AddrRegs.push_back(SrcOp.getReg());
4073       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4074     }
4075   }
4076 
4077   int NumAddrRegs = AddrRegs.size();
4078   if (NumAddrRegs != 1) {
4079     // Round up to 8 elements for v5-v7
4080     // FIXME: Missing intermediate sized register classes and instructions.
4081     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4082       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4083       auto Undef = B.buildUndef(S32);
4084       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4085       NumAddrRegs = RoundedNumRegs;
4086     }
4087 
4088     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4089     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4090   }
4091 
4092   for (int I = 1; I != NumVAddrs; ++I) {
4093     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4094     if (SrcOp.isReg())
4095       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4096   }
4097 }
4098 
4099 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4100 ///
4101 /// Depending on the subtarget, load/store with 16-bit element data need to be
4102 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4103 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4104 /// registers.
4105 ///
4106 /// We don't want to directly select image instructions just yet, but also want
4107 /// to expose all register repacking to the legalizer/combiners. We also don't
4108 /// want a selected instruction entering RegBankSelect. In order to avoid
4109 /// defining a multitude of intermediate image instructions, directly hack on
4110 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4111 /// now unnecessary arguments with $noreg.
4112 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4113     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4114     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4115 
4116   const unsigned NumDefs = MI.getNumExplicitDefs();
4117   const unsigned ArgOffset = NumDefs + 1;
4118   bool IsTFE = NumDefs == 2;
4119   // We are only processing the operands of d16 image operations on subtargets
4120   // that use the unpacked register layout, or need to repack the TFE result.
4121 
4122   // TODO: Do we need to guard against already legalized intrinsics?
4123   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4124       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4125 
4126   MachineRegisterInfo *MRI = B.getMRI();
4127   const LLT S32 = LLT::scalar(32);
4128   const LLT S16 = LLT::scalar(16);
4129   const LLT V2S16 = LLT::vector(2, 16);
4130 
4131   unsigned DMask = 0;
4132 
4133   // Check for 16 bit addresses and pack if true.
4134   LLT GradTy =
4135       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4136   LLT AddrTy =
4137       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4138   const bool IsG16 = GradTy == S16;
4139   const bool IsA16 = AddrTy == S16;
4140 
4141   int DMaskLanes = 0;
4142   if (!BaseOpcode->Atomic) {
4143     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4144     if (BaseOpcode->Gather4) {
4145       DMaskLanes = 4;
4146     } else if (DMask != 0) {
4147       DMaskLanes = countPopulation(DMask);
4148     } else if (!IsTFE && !BaseOpcode->Store) {
4149       // If dmask is 0, this is a no-op load. This can be eliminated.
4150       B.buildUndef(MI.getOperand(0));
4151       MI.eraseFromParent();
4152       return true;
4153     }
4154   }
4155 
4156   Observer.changingInstr(MI);
4157   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4158 
4159   unsigned NewOpcode = NumDefs == 0 ?
4160     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4161 
4162   // Track that we legalized this
4163   MI.setDesc(B.getTII().get(NewOpcode));
4164 
4165   // TFE is enabled and dmask is 0, so we expect only the error flag. Force
4166   // dmask to be at least 1, otherwise the instruction will fail.
4167   if (IsTFE && DMask == 0) {
4168     DMask = 0x1;
4169     DMaskLanes = 1;
4170     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4171   }
4172 
4173   if (BaseOpcode->Atomic) {
4174     Register VData0 = MI.getOperand(2).getReg();
4175     LLT Ty = MRI->getType(VData0);
4176 
4177     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4178     if (Ty.isVector())
4179       return false;
4180 
4181     if (BaseOpcode->AtomicX2) {
4182       Register VData1 = MI.getOperand(3).getReg();
4183       // The two values are packed in one register.
4184       LLT PackedTy = LLT::vector(2, Ty);
4185       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4186       MI.getOperand(2).setReg(Concat.getReg(0));
4187       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4188     }
4189   }
4190 
4191   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4192 
4193   // Optimize _L to _LZ when _L is zero
4194   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4195           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4196     const ConstantFP *ConstantLod;
4197 
4198     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4199                  m_GFCst(ConstantLod))) {
4200       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4201         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4202         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4203             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4204                                                       Intr->Dim);
4205 
4206         // The starting indexes should remain in the same place.
4207         --CorrectedNumVAddrs;
4208 
4209         MI.getOperand(MI.getNumExplicitDefs())
4210             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4211         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4212         Intr = NewImageDimIntr;
4213       }
4214     }
4215   }
4216 
4217   // Optimize _mip away, when 'lod' is zero
4218   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4219     int64_t ConstantLod;
4220     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4221                  m_ICst(ConstantLod))) {
4222       if (ConstantLod == 0) {
4223         // TODO: Change intrinsic opcode and remove operand instead of replacing
4224         // it with 0, as the _L to _LZ handling is done above.
4225         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4226         --CorrectedNumVAddrs;
4227       }
4228     }
4229   }
4230 
4231   // Rewrite the addressing register layout before doing anything else.
4232   if (IsA16 || IsG16) {
4233     if (IsA16) {
4234       // Target must support the feature and gradients need to be 16 bit too
4235       if (!ST.hasA16() || !IsG16)
4236         return false;
4237     } else if (!ST.hasG16())
4238       return false;
4239 
4240     if (Intr->NumVAddrs > 1) {
4241       SmallVector<Register, 4> PackedRegs;
4242       // Don't compress addresses for G16
4243       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4244       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4245                                   PackEndIdx);
4246 
4247       if (!IsA16) {
4248         // Add uncompressed address
4249         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4250           Register AddrReg = MI.getOperand(ArgOffset + I).getReg();
4251           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4252           PackedRegs.push_back(AddrReg);
4253         }
4254       }
4255 
4256       // See also below in the non-a16 branch
4257       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4258 
4259       if (!UseNSA && PackedRegs.size() > 1) {
4260         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4261         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4262         PackedRegs[0] = Concat.getReg(0);
4263         PackedRegs.resize(1);
4264       }
4265 
4266       const unsigned NumPacked = PackedRegs.size();
4267       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4268         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4269         if (!SrcOp.isReg()) {
4270           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4271           continue;
4272         }
4273 
4274         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4275 
4276         if (I - Intr->VAddrStart < NumPacked)
4277           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4278         else
4279           SrcOp.setReg(AMDGPU::NoRegister);
4280       }
4281     }
4282   } else {
4283     // If the register allocator cannot place the address registers contiguously
4284     // without introducing moves, then using the non-sequential address encoding
4285     // is always preferable, since it saves VALU instructions and is usually
4286     // neutral or better for code size.
4287     //
4288     // However, we currently have no way of hinting to the register allocator
4289     // that MIMG addresses should be placed contiguously when it is possible to
4290     // do so, so force non-NSA for the common 2-address case as a heuristic.
4291     //
4292     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4293     // allocation when possible.
4294     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4295 
4296     if (!UseNSA && Intr->NumVAddrs > 1)
4297       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4298                                Intr->NumVAddrs);
4299   }
4300 
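       // Append an immediate operand recording the address mode: bit 0 is set
       // for A16 and bit 1 for G16.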
4301   int Flags = 0;
4302   if (IsA16)
4303     Flags |= 1;
4304   if (IsG16)
4305     Flags |= 2;
4306   MI.addOperand(MachineOperand::CreateImm(Flags));
4307 
4308   if (BaseOpcode->Store) { // No TFE for stores?
4309     // TODO: Handle dmask trim
4310     Register VData = MI.getOperand(1).getReg();
4311     LLT Ty = MRI->getType(VData);
4312     if (!Ty.isVector() || Ty.getElementType() != S16)
4313       return true;
4314 
4315     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4316     if (RepackedReg != VData) {
4317       MI.getOperand(1).setReg(RepackedReg);
4318     }
4319 
4320     return true;
4321   }
4322 
4323   Register DstReg = MI.getOperand(0).getReg();
4324   LLT Ty = MRI->getType(DstReg);
4325   const LLT EltTy = Ty.getScalarType();
4326   const bool IsD16 = Ty.getScalarType() == S16;
4327   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4328 
4329   // Confirm that the return type is large enough for the dmask specified
4330   if (NumElts < DMaskLanes)
4331     return false;
4332 
4333   if (NumElts > 4 || DMaskLanes > 4)
4334     return false;
4335 
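       // A dmask of 0 is still treated as producing a single data element.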
4336   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4337   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4338 
4339   // The raw dword-aligned data component of the load. The only legal cases
4340   // where this matters should be when using the packed D16 format, for
4341   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4342   LLT RoundedTy;
4343 
4344   // S32 vector to cover all data, plus the TFE result element.
4345   LLT TFETy;
4346 
4347   // Register type to use for each loaded component. Will be S32 or V2S16.
4348   LLT RegTy;
4349 
4350   if (IsD16 && ST.hasUnpackedD16VMem()) {
4351     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4352     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4353     RegTy = S32;
4354   } else {
4355     unsigned EltSize = EltTy.getSizeInBits();
4356     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4357     unsigned RoundedSize = 32 * RoundedElts;
4358     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4359     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4360     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4361   }
4362 
4363   // The return type does not need adjustment.
4364   // TODO: Should we change s16 case to s32 or <2 x s16>?
4365   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4366     return true;
4367 
4368   Register Dst1Reg;
4369 
4370   // Insert after the instruction.
4371   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4372 
4373   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4374   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
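       // The in-register form of the raw load result; with TFE this includes
       // the extra status dword appended after the data.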
4375   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4376   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4377 
4378   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4379 
4380   MI.getOperand(0).setReg(NewResultReg);
4381 
4382   // In the IR, TFE is supposed to be used with a 2 element struct return
4383   // type. The instruction really returns these two values in one contiguous
4384   // register, with one additional dword beyond the loaded data. Rewrite the
4385   // return type to use a single register result.
4386 
4387   if (IsTFE) {
4388     Dst1Reg = MI.getOperand(1).getReg();
4389     if (MRI->getType(Dst1Reg) != S32)
4390       return false;
4391 
4392     // TODO: Make sure the TFE operand bit is set.
4393     MI.RemoveOperand(1);
4394 
4395     // Handle the easy case that requires no repack instructions.
4396     if (Ty == S32) {
4397       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4398       return true;
4399     }
4400   }
4401 
4402   // Now figure out how to copy the new result register back into the old
4403   // result.
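       // Pre-fill every slot with Dst1Reg so that, for TFE, the trailing status
       // dword is unmerged directly into its final destination register.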
4404   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4405 
4406   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4407 
4408   if (ResultNumRegs == 1) {
4409     assert(!IsTFE);
4410     ResultRegs[0] = NewResultReg;
4411   } else {
4412     // We have to repack into a new vector of some kind.
4413     for (int I = 0; I != NumDataRegs; ++I)
4414       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4415     B.buildUnmerge(ResultRegs, NewResultReg);
4416 
4417     // Drop the final TFE element to get the data part. The TFE result is
4418     // directly written to the right place already.
4419     if (IsTFE)
4420       ResultRegs.resize(NumDataRegs);
4421   }
4422 
4423   // For an s16 scalar result, we form an s32 result with a truncate regardless
4424   // of packed vs. unpacked.
4425   if (IsD16 && !Ty.isVector()) {
4426     B.buildTrunc(DstReg, ResultRegs[0]);
4427     return true;
4428   }
4429 
4430   // Avoid a build/concat_vector of 1 entry.
4431   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4432     B.buildBitcast(DstReg, ResultRegs[0]);
4433     return true;
4434   }
4435 
4436   assert(Ty.isVector());
4437 
4438   if (IsD16) {
4439     // For packed D16 results with TFE enabled, all the data components are
4440     // S32. Cast back to the expected type.
4441     //
4442     // TODO: We don't really need to load s32 elements. We would only need one
4443     // cast for the TFE result if a multiple of v2s16 was used.
4444     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4445       for (Register &Reg : ResultRegs)
4446         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4447     } else if (ST.hasUnpackedD16VMem()) {
4448       for (Register &Reg : ResultRegs)
4449         Reg = B.buildTrunc(S16, Reg).getReg(0);
4450     }
4451   }
4452 
4453   auto padWithUndef = [&](LLT Ty, int NumElts) {
4454     if (NumElts == 0)
4455       return;
4456     Register Undef = B.buildUndef(Ty).getReg(0);
4457     for (int I = 0; I != NumElts; ++I)
4458       ResultRegs.push_back(Undef);
4459   };
4460 
4461   // Pad out any elements eliminated due to the dmask.
4462   LLT ResTy = MRI->getType(ResultRegs[0]);
4463   if (!ResTy.isVector()) {
4464     padWithUndef(ResTy, NumElts - ResultRegs.size());
4465     B.buildBuildVector(DstReg, ResultRegs);
4466     return true;
4467   }
4468 
4469   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
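       // Number of 32-bit registers needed to cover the full destination type.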
4470   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4471 
4472   // Deal with the one annoying legal case.
4473   const LLT V3S16 = LLT::vector(3, 16);
4474   if (Ty == V3S16) {
4475     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4476     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4477     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4478     return true;
4479   }
4480 
4481   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4482   B.buildConcatVectors(DstReg, ResultRegs);
4483   return true;
4484 }
4485 
4486 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4487   LegalizerHelper &Helper, MachineInstr &MI) const {
4488   MachineIRBuilder &B = Helper.MIRBuilder;
4489   GISelChangeObserver &Observer = Helper.Observer;
4490 
4491   Register Dst = MI.getOperand(0).getReg();
4492   LLT Ty = B.getMRI()->getType(Dst);
4493   unsigned Size = Ty.getSizeInBits();
4494   MachineFunction &MF = B.getMF();
4495 
4496   Observer.changingInstr(MI);
4497 
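       // If the target prefers to legalize this result type through a bitcast
       // (see shouldBitcastLoadStoreType), rewrite the destination type first.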
4498   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4499     Ty = getBitcastRegisterType(Ty);
4500     Helper.bitcastDst(MI, Ty, 0);
4501     Dst = MI.getOperand(0).getReg();
4502     B.setInsertPt(B.getMBB(), MI);
4503   }
4504 
4505   // FIXME: We don't really need this intermediate instruction. The intrinsic
4506   // should be fixed to have a memory operand. Since it's readnone, we're not
4507   // allowed to add one.
4508   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4509   MI.RemoveOperand(1); // Remove intrinsic ID
4510 
4511   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4512   // TODO: Should this use datalayout alignment?
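       // Round the result size up from bits to whole bytes.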
4513   const unsigned MemSize = (Size + 7) / 8;
4514   const Align MemAlign(4);
4515   MachineMemOperand *MMO = MF.getMachineMemOperand(
4516       MachinePointerInfo(),
4517       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4518           MachineMemOperand::MOInvariant,
4519       MemSize, MemAlign);
4520   MI.addMemOperand(MF, MMO);
4521 
4522   // There are no 96-bit result scalar loads, but widening to 128-bit should
4523   // always be legal. We may need to restore this to a 96-bit result if it turns
4524   // out this needs to be converted to a vector load during RegBankSelect.
4525   if (!isPowerOf2_32(Size)) {
4526     if (Ty.isVector())
4527       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4528     else
4529       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4530   }
4531 
4532   Observer.changedInstr(MI);
4533   return true;
4534 }
4535 
4536 // TODO: Move to selection
4537 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4538                                                 MachineRegisterInfo &MRI,
4539                                                 MachineIRBuilder &B) const {
4540   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4541   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4542       !ST.isTrapHandlerEnabled()) {
4543     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4544   } else {
4545     // Pass queue pointer to trap handler as input, and insert trap instruction
4546     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4547     MachineRegisterInfo &MRI = *B.getMRI();
4548 
4549     Register LiveIn =
4550       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4551     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4552       return false;
4553 
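         // The HSA trap handler ABI expects the queue pointer in SGPR0_SGPR1.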
4554     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4555     B.buildCopy(SGPR01, LiveIn);
4556     B.buildInstr(AMDGPU::S_TRAP)
4557         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4558         .addReg(SGPR01, RegState::Implicit);
4559   }
4560 
4561   MI.eraseFromParent();
4562   return true;
4563 }
4564 
4565 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4566     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4567   // If this is a non-HSA path or the trap handler is disabled, report a
4568   // warning accordingly.
4569   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4570       !ST.isTrapHandlerEnabled()) {
4571     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4572                                      "debugtrap handler not supported",
4573                                      MI.getDebugLoc(), DS_Warning);
4574     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4575     Ctx.diagnose(NoTrap);
4576   } else {
4577     // Insert debug-trap instruction
4578     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4579   }
4580 
4581   MI.eraseFromParent();
4582   return true;
4583 }
4584 
4585 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4586                                                MachineIRBuilder &B) const {
4587   MachineRegisterInfo &MRI = *B.getMRI();
4588   const LLT S16 = LLT::scalar(16);
4589   const LLT S32 = LLT::scalar(32);
4590 
4591   Register DstReg = MI.getOperand(0).getReg();
4592   Register NodePtr = MI.getOperand(2).getReg();
4593   Register RayExtent = MI.getOperand(3).getReg();
4594   Register RayOrigin = MI.getOperand(4).getReg();
4595   Register RayDir = MI.getOperand(5).getReg();
4596   Register RayInvDir = MI.getOperand(6).getReg();
4597   Register TDescr = MI.getOperand(7).getReg();
4598 
4599   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4600   bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4601   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4602                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4603                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4604                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4605 
4606   SmallVector<Register, 12> Ops;
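       // A 64-bit node pointer is passed as two s32 address dwords.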
4607   if (Is64) {
4608     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4609     Ops.push_back(Unmerge.getReg(0));
4610     Ops.push_back(Unmerge.getReg(1));
4611   } else {
4612     Ops.push_back(NodePtr);
4613   }
4614   Ops.push_back(RayExtent);
4615 
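       // Split a 128-bit vector operand into four s32 pieces and push only the
       // first three lanes (x, y, z).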
4616   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4617     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4618     Ops.push_back(Unmerge.getReg(0));
4619     Ops.push_back(Unmerge.getReg(1));
4620     Ops.push_back(Unmerge.getReg(2));
4621   };
4622 
4623   packLanes(RayOrigin);
4624   if (IsA16) {
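         // For 16-bit rays, the six half components of dir and inv_dir are
         // packed into three dwords: (dir.x, dir.y), (dir.z, inv_dir.x) and
         // (inv_dir.y, inv_dir.z).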
4625     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4626     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4627     Register R1 = MRI.createGenericVirtualRegister(S32);
4628     Register R2 = MRI.createGenericVirtualRegister(S32);
4629     Register R3 = MRI.createGenericVirtualRegister(S32);
4630     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4631     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4632     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4633     Ops.push_back(R1);
4634     Ops.push_back(R2);
4635     Ops.push_back(R3);
4636   } else {
4637     packLanes(RayDir);
4638     packLanes(RayInvDir);
4639   }
4640 
4641   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4642     .addDef(DstReg)
4643     .addImm(Opcode);
4644 
4645   for (Register R : Ops) {
4646     MIB.addUse(R);
4647   }
4648 
4649   MIB.addUse(TDescr)
4650      .addImm(IsA16 ? 1 : 0)
4651      .cloneMemRefs(MI);
4652 
4653   MI.eraseFromParent();
4654   return true;
4655 }
4656 
4657 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4658                                             MachineInstr &MI) const {
4659   MachineIRBuilder &B = Helper.MIRBuilder;
4660   MachineRegisterInfo &MRI = *B.getMRI();
4661 
4662   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4663   auto IntrID = MI.getIntrinsicID();
4664   switch (IntrID) {
4665   case Intrinsic::amdgcn_if:
4666   case Intrinsic::amdgcn_else: {
4667     MachineInstr *Br = nullptr;
4668     MachineBasicBlock *UncondBrTarget = nullptr;
4669     bool Negated = false;
4670     if (MachineInstr *BrCond =
4671             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4672       const SIRegisterInfo *TRI
4673         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4674 
4675       Register Def = MI.getOperand(1).getReg();
4676       Register Use = MI.getOperand(3).getReg();
4677 
4678       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4679 
4680       if (Negated)
4681         std::swap(CondBrTarget, UncondBrTarget);
4682 
4683       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4684       if (IntrID == Intrinsic::amdgcn_if) {
4685         B.buildInstr(AMDGPU::SI_IF)
4686           .addDef(Def)
4687           .addUse(Use)
4688           .addMBB(UncondBrTarget);
4689       } else {
4690         B.buildInstr(AMDGPU::SI_ELSE)
4691             .addDef(Def)
4692             .addUse(Use)
4693             .addMBB(UncondBrTarget);
4694       }
4695 
4696       if (Br) {
4697         Br->getOperand(0).setMBB(CondBrTarget);
4698       } else {
4699         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4700         // since we're swapping branch targets it needs to be reinserted.
4701         // FIXME: IRTranslator should probably not do this
4702         B.buildBr(*CondBrTarget);
4703       }
4704 
4705       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4706       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4707       MI.eraseFromParent();
4708       BrCond->eraseFromParent();
4709       return true;
4710     }
4711 
4712     return false;
4713   }
4714   case Intrinsic::amdgcn_loop: {
4715     MachineInstr *Br = nullptr;
4716     MachineBasicBlock *UncondBrTarget = nullptr;
4717     bool Negated = false;
4718     if (MachineInstr *BrCond =
4719             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4720       const SIRegisterInfo *TRI
4721         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4722 
4723       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4724       Register Reg = MI.getOperand(2).getReg();
4725 
4726       if (Negated)
4727         std::swap(CondBrTarget, UncondBrTarget);
4728 
4729       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4730       B.buildInstr(AMDGPU::SI_LOOP)
4731         .addUse(Reg)
4732         .addMBB(UncondBrTarget);
4733 
4734       if (Br)
4735         Br->getOperand(0).setMBB(CondBrTarget);
4736       else
4737         B.buildBr(*CondBrTarget);
4738 
4739       MI.eraseFromParent();
4740       BrCond->eraseFromParent();
4741       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4742       return true;
4743     }
4744 
4745     return false;
4746   }
4747   case Intrinsic::amdgcn_kernarg_segment_ptr:
4748     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4749       // This only makes sense to call in a kernel, so just lower to null.
4750       B.buildConstant(MI.getOperand(0).getReg(), 0);
4751       MI.eraseFromParent();
4752       return true;
4753     }
4754 
4755     return legalizePreloadedArgIntrin(
4756       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4757   case Intrinsic::amdgcn_implicitarg_ptr:
4758     return legalizeImplicitArgPtr(MI, MRI, B);
4759   case Intrinsic::amdgcn_workitem_id_x:
4760     return legalizePreloadedArgIntrin(MI, MRI, B,
4761                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4762   case Intrinsic::amdgcn_workitem_id_y:
4763     return legalizePreloadedArgIntrin(MI, MRI, B,
4764                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4765   case Intrinsic::amdgcn_workitem_id_z:
4766     return legalizePreloadedArgIntrin(MI, MRI, B,
4767                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4768   case Intrinsic::amdgcn_workgroup_id_x:
4769     return legalizePreloadedArgIntrin(MI, MRI, B,
4770                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4771   case Intrinsic::amdgcn_workgroup_id_y:
4772     return legalizePreloadedArgIntrin(MI, MRI, B,
4773                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4774   case Intrinsic::amdgcn_workgroup_id_z:
4775     return legalizePreloadedArgIntrin(MI, MRI, B,
4776                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4777   case Intrinsic::amdgcn_dispatch_ptr:
4778     return legalizePreloadedArgIntrin(MI, MRI, B,
4779                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4780   case Intrinsic::amdgcn_queue_ptr:
4781     return legalizePreloadedArgIntrin(MI, MRI, B,
4782                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4783   case Intrinsic::amdgcn_implicit_buffer_ptr:
4784     return legalizePreloadedArgIntrin(
4785       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4786   case Intrinsic::amdgcn_dispatch_id:
4787     return legalizePreloadedArgIntrin(MI, MRI, B,
4788                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4789   case Intrinsic::amdgcn_fdiv_fast:
4790     return legalizeFDIVFastIntrin(MI, MRI, B);
4791   case Intrinsic::amdgcn_is_shared:
4792     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4793   case Intrinsic::amdgcn_is_private:
4794     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4795   case Intrinsic::amdgcn_wavefrontsize: {
4796     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4797     MI.eraseFromParent();
4798     return true;
4799   }
4800   case Intrinsic::amdgcn_s_buffer_load:
4801     return legalizeSBufferLoad(Helper, MI);
4802   case Intrinsic::amdgcn_raw_buffer_store:
4803   case Intrinsic::amdgcn_struct_buffer_store:
4804     return legalizeBufferStore(MI, MRI, B, false, false);
4805   case Intrinsic::amdgcn_raw_buffer_store_format:
4806   case Intrinsic::amdgcn_struct_buffer_store_format:
4807     return legalizeBufferStore(MI, MRI, B, false, true);
4808   case Intrinsic::amdgcn_raw_tbuffer_store:
4809   case Intrinsic::amdgcn_struct_tbuffer_store:
4810     return legalizeBufferStore(MI, MRI, B, true, true);
4811   case Intrinsic::amdgcn_raw_buffer_load:
4812   case Intrinsic::amdgcn_struct_buffer_load:
4813     return legalizeBufferLoad(MI, MRI, B, false, false);
4814   case Intrinsic::amdgcn_raw_buffer_load_format:
4815   case Intrinsic::amdgcn_struct_buffer_load_format:
4816     return legalizeBufferLoad(MI, MRI, B, true, false);
4817   case Intrinsic::amdgcn_raw_tbuffer_load:
4818   case Intrinsic::amdgcn_struct_tbuffer_load:
4819     return legalizeBufferLoad(MI, MRI, B, true, true);
4820   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4821   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4822   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4823   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4824   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4825   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4826   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4827   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4828   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4829   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4830   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4831   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4832   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4833   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4834   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4835   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4836   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4837   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4838   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4839   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4840   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4841   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4842   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4843   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4844   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4845   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4846   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4847   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4848   case Intrinsic::amdgcn_buffer_atomic_fadd:
4849   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4850   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4851   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4852   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4853     return legalizeBufferAtomic(MI, B, IntrID);
4854   case Intrinsic::amdgcn_atomic_inc:
4855     return legalizeAtomicIncDec(MI, B, true);
4856   case Intrinsic::amdgcn_atomic_dec:
4857     return legalizeAtomicIncDec(MI, B, false);
4858   case Intrinsic::trap:
4859     return legalizeTrapIntrinsic(MI, MRI, B);
4860   case Intrinsic::debugtrap:
4861     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4862   case Intrinsic::amdgcn_rsq_clamp:
4863     return legalizeRsqClampIntrinsic(MI, MRI, B);
4864   case Intrinsic::amdgcn_ds_fadd:
4865   case Intrinsic::amdgcn_ds_fmin:
4866   case Intrinsic::amdgcn_ds_fmax:
4867     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4868   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4869     return legalizeBVHIntrinsic(MI, B);
4870   default: {
4871     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4872             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4873       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4874     return true;
4875   }
4876   }
4877 
4878   return true;
4879 }
4880