1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/ADT/ScopeExit.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 
28 #define DEBUG_TYPE "amdgpu-legalinfo"
29 
30 using namespace llvm;
31 using namespace LegalizeActions;
32 using namespace LegalizeMutations;
33 using namespace LegalityPredicates;
34 using namespace MIPatternMatch;
35 
36 // Hack until load/store selection patterns support any tuple of legal types.
37 static cl::opt<bool> EnableNewLegality(
38   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
41   cl::init(false),
42   cl::ReallyHidden);
43 
44 static constexpr unsigned MaxRegisterSize = 1024;
45 
// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
59 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     if (!Ty.isVector())
67       return false;
68 
69     const LLT EltTy = Ty.getElementType();
70     const unsigned EltSize = EltTy.getSizeInBits();
71     return Ty.getNumElements() % 2 != 0 &&
72            EltSize > 1 && EltSize < 32 &&
73            Ty.getSizeInBits() % 32 != 0;
74   };
75 }
76 
77 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
78   return [=](const LegalityQuery &Query) {
79     const LLT Ty = Query.Types[TypeIdx];
80     return Ty.getSizeInBits() % 32 == 0;
81   };
82 }
83 
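// True if the type at TypeIdx has 16-bit elements and more than two of them.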
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
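// Grow the vector at TypeIdx by one element of the same element type.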
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
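// Break the vector at TypeIdx into pieces of at most 64 bits each.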
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
// Increase the number of vector elements so the total size is the next
// multiple of 32 bits.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
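// Pick a register-like type with the same total size to bitcast an awkward
// load/store type to: a plain scalar for sizes up to 32 bits, otherwise a
// vector of 32-bit elements.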
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}
141 
142 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
143   return [=](const LegalityQuery &Query) {
144     const LLT Ty = Query.Types[TypeIdx];
145     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
146   };
147 }
148 
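// Bitcast to a vector of 32-bit elements covering the same total size, which
// must be a multiple of 32 bits.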
149 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT Ty = Query.Types[TypeIdx];
152     unsigned Size = Ty.getSizeInBits();
153     assert(Size % 32 == 0);
154     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
155   };
156 }
157 
158 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
159   return [=](const LegalityQuery &Query) {
160     const LLT QueryTy = Query.Types[TypeIdx];
161     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
162   };
163 }
164 
165 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
166   return [=](const LegalityQuery &Query) {
167     const LLT QueryTy = Query.Types[TypeIdx];
168     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
169   };
170 }
171 
172 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
173   return [=](const LegalityQuery &Query) {
174     const LLT QueryTy = Query.Types[TypeIdx];
175     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
176   };
177 }
178 
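// True if a value of this size fills a whole number of 32-bit registers, up
// to MaxRegisterSize.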
179 static bool isRegisterSize(unsigned Size) {
180   return Size % 32 == 0 && Size <= MaxRegisterSize;
181 }
182 
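// True for element types that map cleanly onto registers: 16-bit elements or
// any multiple of 32 bits.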
183 static bool isRegisterVectorElementType(LLT EltTy) {
184   const int EltSize = EltTy.getSizeInBits();
185   return EltSize == 16 || EltSize % 32 == 0;
186 }
187 
188 static bool isRegisterVectorType(LLT Ty) {
189   const int EltSize = Ty.getElementType().getSizeInBits();
190   return EltSize == 32 || EltSize == 64 ||
191          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
192          EltSize == 128 || EltSize == 256;
193 }
194 
195 static bool isRegisterType(LLT Ty) {
196   if (!isRegisterSize(Ty.getSizeInBits()))
197     return false;
198 
199   if (Ty.isVector())
200     return isRegisterVectorType(Ty);
201 
202   return true;
203 }
204 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
207 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
208   return [=](const LegalityQuery &Query) {
209     return isRegisterType(Query.Types[TypeIdx]);
210   };
211 }
212 
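// True if this is a vector whose elements are directly supported in
// registers: s16 or anything at least 32 bits wide.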
213 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     const LLT QueryTy = Query.Types[TypeIdx];
216     if (!QueryTy.isVector())
217       return false;
218     const LLT EltTy = QueryTy.getElementType();
219     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
220   };
221 }
222 
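// True for a truncating store of a scalar wider than 32 bits.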
223 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
224   return [=](const LegalityQuery &Query) {
225     const LLT Ty = Query.Types[TypeIdx];
226     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
227            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
228   };
229 }
230 
231 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
232 // handle some operations by just promoting the register during
233 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
234 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
235                                     bool IsLoad) {
236   switch (AS) {
237   case AMDGPUAS::PRIVATE_ADDRESS:
238     // FIXME: Private element size.
239     return ST.enableFlatScratch() ? 128 : 32;
240   case AMDGPUAS::LOCAL_ADDRESS:
241     return ST.useDS128() ? 128 : 64;
242   case AMDGPUAS::GLOBAL_ADDRESS:
243   case AMDGPUAS::CONSTANT_ADDRESS:
244   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
251     return IsLoad ? 512 : 128;
252   default:
253     // Flat addresses may contextually need to be split to 32-bit parts if they
254     // may alias scratch depending on the subtarget.
255     return 128;
256   }
257 }
258 
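// Check whether a load/store of this register type, memory size and alignment
// can be selected directly for the given address space.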
259 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
260                                  const LegalityQuery &Query,
261                                  unsigned Opcode) {
262   const LLT Ty = Query.Types[0];
263 
264   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
265   const bool IsLoad = Opcode != AMDGPU::G_STORE;
266 
267   unsigned RegSize = Ty.getSizeInBits();
268   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
269   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
270   unsigned AS = Query.Types[1].getAddressSpace();
271 
272   // All of these need to be custom lowered to cast the pointer operand.
273   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
274     return false;
275 
276   // TODO: We should be able to widen loads if the alignment is high enough, but
277   // we also need to modify the memory access size.
278 #if 0
279   // Accept widening loads based on alignment.
280   if (IsLoad && MemSize < Size)
281     MemSize = std::max(MemSize, Align);
282 #endif
283 
284   // Only 1-byte and 2-byte to 32-bit extloads are valid.
285   if (MemSize != RegSize && RegSize != 32)
286     return false;
287 
288   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
289     return false;
290 
291   switch (MemSize) {
292   case 8:
293   case 16:
294   case 32:
295   case 64:
296   case 128:
297     break;
298   case 96:
299     if (!ST.hasDwordx3LoadStores())
300       return false;
301     break;
302   case 256:
303   case 512:
304     // These may contextually need to be broken down.
305     break;
306   default:
307     return false;
308   }
309 
310   assert(RegSize >= MemSize);
311 
312   if (AlignBits < MemSize) {
313     const SITargetLowering *TLI = ST.getTargetLowering();
314     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
315                                                  Align(AlignBits / 8)))
316       return false;
317   }
318 
319   return true;
320 }
321 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
326 static bool loadStoreBitcastWorkaround(const LLT Ty) {
327   if (EnableNewLegality)
328     return false;
329 
330   const unsigned Size = Ty.getSizeInBits();
331   if (Size <= 64)
332     return false;
333   if (!Ty.isVector())
334     return true;
335 
336   LLT EltTy = Ty.getElementType();
337   if (EltTy.isPointer())
338     return true;
339 
340   unsigned EltSize = EltTy.getSizeInBits();
341   return EltSize != 32 && EltSize != 64;
342 }
343 
344 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
345                              unsigned Opcode) {
346   const LLT Ty = Query.Types[0];
347   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
348          !loadStoreBitcastWorkaround(Ty);
349 }
350 
351 /// Return true if a load or store of the type should be lowered with a bitcast
352 /// to a different type.
353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354                                        const unsigned MemSizeInBits) {
355   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
358 
359   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360     return true;
361   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362          !isRegisterVectorElementType(Ty.getElementType());
363 }
364 
365 /// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
367 /// changes, not the size of the result register.
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
369                             unsigned AlignInBits, unsigned AddrSpace,
370                             unsigned Opcode) {
371   // We don't want to widen cases that are naturally legal.
372   if (isPowerOf2_32(SizeInBits))
373     return false;
374 
375   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
376   // end up widening these for a scalar load during RegBankSelect, since there
377   // aren't 96-bit scalar loads.
378   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
379     return false;
380 
381   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
382     return false;
383 
384   // A load is known dereferenceable up to the alignment, so it's legal to widen
385   // to it.
386   //
387   // TODO: Could check dereferenceable for less aligned cases.
388   unsigned RoundedSize = NextPowerOf2(SizeInBits);
389   if (AlignInBits < RoundedSize)
390     return false;
391 
392   // Do not widen if it would introduce a slow unaligned load.
393   const SITargetLowering *TLI = ST.getTargetLowering();
394   bool Fast = false;
395   return TLI->allowsMisalignedMemoryAccessesImpl(
396              RoundedSize, AddrSpace, Align(AlignInBits / 8),
397              MachineMemOperand::MOLoad, &Fast) &&
398          Fast;
399 }
400 
401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402                             unsigned Opcode) {
403   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404     return false;
405 
406   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407                          Query.MMODescrs[0].AlignInBits,
408                          Query.Types[1].getAddressSpace(), Opcode);
409 }
410 
411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
412                                          const GCNTargetMachine &TM)
413   :  ST(ST_) {
414   using namespace TargetOpcode;
415 
416   auto GetAddrSpacePtr = [&TM](unsigned AS) {
417     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
418   };
419 
420   const LLT S1 = LLT::scalar(1);
421   const LLT S8 = LLT::scalar(8);
422   const LLT S16 = LLT::scalar(16);
423   const LLT S32 = LLT::scalar(32);
424   const LLT S64 = LLT::scalar(64);
425   const LLT S128 = LLT::scalar(128);
426   const LLT S256 = LLT::scalar(256);
427   const LLT S512 = LLT::scalar(512);
428   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
429 
430   const LLT V2S8 = LLT::vector(2, 8);
431   const LLT V2S16 = LLT::vector(2, 16);
432   const LLT V4S16 = LLT::vector(4, 16);
433 
434   const LLT V2S32 = LLT::vector(2, 32);
435   const LLT V3S32 = LLT::vector(3, 32);
436   const LLT V4S32 = LLT::vector(4, 32);
437   const LLT V5S32 = LLT::vector(5, 32);
438   const LLT V6S32 = LLT::vector(6, 32);
439   const LLT V7S32 = LLT::vector(7, 32);
440   const LLT V8S32 = LLT::vector(8, 32);
441   const LLT V9S32 = LLT::vector(9, 32);
442   const LLT V10S32 = LLT::vector(10, 32);
443   const LLT V11S32 = LLT::vector(11, 32);
444   const LLT V12S32 = LLT::vector(12, 32);
445   const LLT V13S32 = LLT::vector(13, 32);
446   const LLT V14S32 = LLT::vector(14, 32);
447   const LLT V15S32 = LLT::vector(15, 32);
448   const LLT V16S32 = LLT::vector(16, 32);
449   const LLT V32S32 = LLT::vector(32, 32);
450 
451   const LLT V2S64 = LLT::vector(2, 64);
452   const LLT V3S64 = LLT::vector(3, 64);
453   const LLT V4S64 = LLT::vector(4, 64);
454   const LLT V5S64 = LLT::vector(5, 64);
455   const LLT V6S64 = LLT::vector(6, 64);
456   const LLT V7S64 = LLT::vector(7, 64);
457   const LLT V8S64 = LLT::vector(8, 64);
458   const LLT V16S64 = LLT::vector(16, 64);
459 
460   std::initializer_list<LLT> AllS32Vectors =
461     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463   std::initializer_list<LLT> AllS64Vectors =
464     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
465 
466   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
467   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
468   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
469   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
470   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
471   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
472   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
473 
474   const LLT CodePtr = FlatPtr;
475 
476   const std::initializer_list<LLT> AddrSpaces64 = {
477     GlobalPtr, ConstantPtr, FlatPtr
478   };
479 
480   const std::initializer_list<LLT> AddrSpaces32 = {
481     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
482   };
483 
484   const std::initializer_list<LLT> FPTypesBase = {
485     S32, S64
486   };
487 
488   const std::initializer_list<LLT> FPTypes16 = {
489     S32, S64, S16
490   };
491 
492   const std::initializer_list<LLT> FPTypesPK16 = {
493     S32, S64, S16, V2S16
494   };
495 
496   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
497 
498   setAction({G_BRCOND, S1}, Legal); // VCC branches
499   setAction({G_BRCOND, S32}, Legal); // SCC branches
500 
501   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
502   // elements for v3s16
503   getActionDefinitionsBuilder(G_PHI)
504     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505     .legalFor(AllS32Vectors)
506     .legalFor(AllS64Vectors)
507     .legalFor(AddrSpaces64)
508     .legalFor(AddrSpaces32)
509     .legalIf(isPointer(0))
510     .clampScalar(0, S16, S256)
511     .widenScalarToNextPow2(0, 32)
512     .clampMaxNumElements(0, S32, 16)
513     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514     .scalarize(0);
515 
516   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517     // Full set of gfx9 features.
518     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
519       .legalFor({S32, S16, V2S16})
520       .clampScalar(0, S16, S32)
521       .clampMaxNumElements(0, S16, 2)
522       .scalarize(0)
523       .widenScalarToNextPow2(0, 32);
524 
525     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526       .legalFor({S32, S16, V2S16}) // Clamp modifier
527       .minScalarOrElt(0, S16)
528       .clampMaxNumElements(0, S16, 2)
529       .scalarize(0)
530       .widenScalarToNextPow2(0, 32)
531       .lower();
532   } else if (ST.has16BitInsts()) {
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16})
535       .clampScalar(0, S16, S32)
536       .scalarize(0)
537       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538 
539     // Technically the saturating operations require clamp bit support, but this
540     // was introduced at the same time as 16-bit operations.
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542       .legalFor({S32, S16}) // Clamp modifier
543       .minScalar(0, S16)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 16)
546       .lower();
547 
548     // We're just lowering this, but it helps get a better result to try to
549     // coerce to the desired type first.
550     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551       .minScalar(0, S16)
552       .scalarize(0)
553       .lower();
554   } else {
555     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
556       .legalFor({S32})
557       .clampScalar(0, S32, S32)
558       .scalarize(0);
559 
560     if (ST.hasIntClamp()) {
561       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562         .legalFor({S32}) // Clamp modifier.
563         .scalarize(0)
564         .minScalarOrElt(0, S32)
565         .lower();
566     } else {
567       // Clamp bit support was added in VI, along with 16-bit operations.
568       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569         .minScalar(0, S32)
570         .scalarize(0)
571         .lower();
572     }
573 
574     // FIXME: DAG expansion gets better results. The widening uses the smaller
575     // range values and goes for the min/max lowering directly.
576     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577       .minScalar(0, S32)
578       .scalarize(0)
579       .lower();
580   }
581 
582   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
583     .customFor({S32, S64})
584     .clampScalar(0, S32, S64)
585     .widenScalarToNextPow2(0, 32)
586     .scalarize(0);
587 
588   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
589                    .legalFor({S32})
590                    .maxScalarOrElt(0, S32);
591 
592   if (ST.hasVOP3PInsts()) {
593     Mulh
594       .clampMaxNumElements(0, S8, 2)
595       .lowerFor({V2S8});
596   }
597 
598   Mulh
599     .scalarize(0)
600     .lower();
601 
602   // Report legal for any types we can handle anywhere. For the cases only legal
603   // on the SALU, RegBankSelect will be able to re-legalize.
604   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
605     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
606     .clampScalar(0, S32, S64)
607     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
608     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
609     .widenScalarToNextPow2(0)
610     .scalarize(0);
611 
612   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
613                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614     .legalFor({{S32, S1}, {S32, S32}})
615     .minScalar(0, S32)
616     // TODO: .scalarize(0)
617     .lower();
618 
619   getActionDefinitionsBuilder(G_BITCAST)
620     // Don't worry about the size constraint.
621     .legalIf(all(isRegisterType(0), isRegisterType(1)))
622     .lower();
623 
625   getActionDefinitionsBuilder(G_CONSTANT)
626     .legalFor({S1, S32, S64, S16, GlobalPtr,
627                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
628     .legalIf(isPointer(0))
629     .clampScalar(0, S32, S64)
630     .widenScalarToNextPow2(0);
631 
632   getActionDefinitionsBuilder(G_FCONSTANT)
633     .legalFor({S32, S64, S16})
634     .clampScalar(0, S16, S64);
635 
636   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
637       .legalIf(isRegisterType(0))
638       // s1 and s16 are special cases because they have legal operations on
639       // them, but don't really occupy registers in the normal way.
640       .legalFor({S1, S16})
641       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
642       .clampScalarOrElt(0, S32, MaxScalar)
643       .widenScalarToNextPow2(0, 32)
644       .clampMaxNumElements(0, S32, 16);
645 
646   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
647 
648   // If the amount is divergent, we have to do a wave reduction to get the
649   // maximum value, so this is expanded during RegBankSelect.
650   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
651     .legalFor({{PrivatePtr, S32}});
652 
653   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
654     .customIf(typeIsNot(0, PrivatePtr));
655 
656   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
657 
658   auto &FPOpActions = getActionDefinitionsBuilder(
659     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
660     .legalFor({S32, S64});
661   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
662     .customFor({S32, S64});
663   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
664     .customFor({S32, S64});
665 
666   if (ST.has16BitInsts()) {
667     if (ST.hasVOP3PInsts())
668       FPOpActions.legalFor({S16, V2S16});
669     else
670       FPOpActions.legalFor({S16});
671 
672     TrigActions.customFor({S16});
673     FDIVActions.customFor({S16});
674   }
675 
676   auto &MinNumMaxNum = getActionDefinitionsBuilder({
677       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
678 
679   if (ST.hasVOP3PInsts()) {
680     MinNumMaxNum.customFor(FPTypesPK16)
681       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682       .clampMaxNumElements(0, S16, 2)
683       .clampScalar(0, S16, S64)
684       .scalarize(0);
685   } else if (ST.has16BitInsts()) {
686     MinNumMaxNum.customFor(FPTypes16)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else {
690     MinNumMaxNum.customFor(FPTypesBase)
691       .clampScalar(0, S32, S64)
692       .scalarize(0);
693   }
694 
695   if (ST.hasVOP3PInsts())
696     FPOpActions.clampMaxNumElements(0, S16, 2);
697 
698   FPOpActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   TrigActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   FDIVActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   getActionDefinitionsBuilder({G_FNEG, G_FABS})
711     .legalFor(FPTypesPK16)
712     .clampMaxNumElements(0, S16, 2)
713     .scalarize(0)
714     .clampScalar(0, S16, S64);
715 
716   if (ST.has16BitInsts()) {
717     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
718       .legalFor({S32, S64, S16})
719       .scalarize(0)
720       .clampScalar(0, S16, S64);
721   } else {
722     getActionDefinitionsBuilder(G_FSQRT)
723       .legalFor({S32, S64})
724       .scalarize(0)
725       .clampScalar(0, S32, S64);
726 
727     if (ST.hasFractBug()) {
728       getActionDefinitionsBuilder(G_FFLOOR)
729         .customFor({S64})
730         .legalFor({S32, S64})
731         .scalarize(0)
732         .clampScalar(0, S32, S64);
733     } else {
734       getActionDefinitionsBuilder(G_FFLOOR)
735         .legalFor({S32, S64})
736         .scalarize(0)
737         .clampScalar(0, S32, S64);
738     }
739   }
740 
741   getActionDefinitionsBuilder(G_FPTRUNC)
742     .legalFor({{S32, S64}, {S16, S32}})
743     .scalarize(0)
744     .lower();
745 
746   getActionDefinitionsBuilder(G_FPEXT)
747     .legalFor({{S64, S32}, {S32, S16}})
748     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
749     .scalarize(0);
750 
751   getActionDefinitionsBuilder(G_FSUB)
752       // Use actual fsub instruction
753       .legalFor({S32})
754       // Must use fadd + fneg
755       .lowerFor({S64, S16, V2S16})
756       .scalarize(0)
757       .clampScalar(0, S32, S64);
758 
759   // Whether this is legal depends on the floating point mode for the function.
760   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
761   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
762     FMad.customFor({S32, S16});
763   else if (ST.hasMadMacF32Insts())
764     FMad.customFor({S32});
765   else if (ST.hasMadF16())
766     FMad.customFor({S16});
767   FMad.scalarize(0)
768       .lower();
769 
770   auto &FRem = getActionDefinitionsBuilder(G_FREM);
771   if (ST.has16BitInsts()) {
772     FRem.customFor({S16, S32, S64});
773   } else {
774     FRem.minScalar(0, S32)
775         .customFor({S32, S64});
776   }
777   FRem.scalarize(0);
778 
779   // TODO: Do we need to clamp maximum bitwidth?
780   getActionDefinitionsBuilder(G_TRUNC)
781     .legalIf(isScalar(0))
782     .legalFor({{V2S16, V2S32}})
783     .clampMaxNumElements(0, S16, 2)
784     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
785     // situations (like an invalid implicit use), we don't want to infinite loop
786     // in the legalizer.
787     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
788     .alwaysLegal();
789 
790   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
791     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
792                {S32, S1}, {S64, S1}, {S16, S1}})
793     .scalarize(0)
794     .clampScalar(0, S32, S64)
795     .widenScalarToNextPow2(1, 32);
796 
797   // TODO: Split s1->s64 during regbankselect for VALU.
798   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
799     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
800     .lowerFor({{S32, S64}})
801     .lowerIf(typeIs(1, S1))
802     .customFor({{S64, S64}});
803   if (ST.has16BitInsts())
804     IToFP.legalFor({{S16, S16}});
805   IToFP.clampScalar(1, S32, S64)
806        .minScalar(0, S32)
807        .scalarize(0)
808        .widenScalarToNextPow2(1);
809 
810   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
811     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812     .customFor({{S64, S64}})
813     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
814   if (ST.has16BitInsts())
815     FPToI.legalFor({{S16, S16}});
816   else
817     FPToI.minScalar(1, S32);
818 
819   FPToI.minScalar(0, S32)
820        .scalarize(0)
821        .lower();
822 
823   // Lower roundeven into G_FRINT
824   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
825     .scalarize(0)
826     .lower();
827 
828   if (ST.has16BitInsts()) {
829     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830       .legalFor({S16, S32, S64})
831       .clampScalar(0, S16, S64)
832       .scalarize(0);
833   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
834     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
835       .legalFor({S32, S64})
836       .clampScalar(0, S32, S64)
837       .scalarize(0);
838   } else {
839     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
840       .legalFor({S32})
841       .customFor({S64})
842       .clampScalar(0, S32, S64)
843       .scalarize(0);
844   }
845 
846   getActionDefinitionsBuilder(G_PTR_ADD)
847     .legalIf(all(isPointer(0), sameSize(0, 1)))
848     .scalarize(0)
849     .scalarSameSizeAs(1, 0);
850 
851   getActionDefinitionsBuilder(G_PTRMASK)
852     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
853     .scalarSameSizeAs(1, 0)
854     .scalarize(0);
855 
856   auto &CmpBuilder =
857     getActionDefinitionsBuilder(G_ICMP)
858     // The compare output type differs based on the register bank of the output,
859     // so make both s1 and s32 legal.
860     //
861     // Scalar compares producing output in scc will be promoted to s32, as that
862     // is the allocatable register type that will be needed for the copy from
863     // scc. This will be promoted during RegBankSelect, and we assume something
864     // before that won't try to use s32 result types.
865     //
866     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
867     // bank.
868     .legalForCartesianProduct(
869       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870     .legalForCartesianProduct(
871       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
872   if (ST.has16BitInsts()) {
873     CmpBuilder.legalFor({{S1, S16}});
874   }
875 
876   CmpBuilder
877     .widenScalarToNextPow2(1)
878     .clampScalar(1, S32, S64)
879     .scalarize(0)
880     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
881 
882   getActionDefinitionsBuilder(G_FCMP)
883     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
884     .widenScalarToNextPow2(1)
885     .clampScalar(1, S32, S64)
886     .scalarize(0);
887 
888   // FIXME: fpow has a selection pattern that should move to custom lowering.
889   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
890   if (ST.has16BitInsts())
891     Exp2Ops.legalFor({S32, S16});
892   else
893     Exp2Ops.legalFor({S32});
894   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
895   Exp2Ops.scalarize(0);
896 
897   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
898   if (ST.has16BitInsts())
899     ExpOps.customFor({{S32}, {S16}});
900   else
901     ExpOps.customFor({S32});
902   ExpOps.clampScalar(0, MinScalarFPTy, S32)
903         .scalarize(0);
904 
905   getActionDefinitionsBuilder(G_FPOWI)
906     .clampScalar(0, MinScalarFPTy, S32)
907     .lower();
908 
909   // The 64-bit versions produce 32-bit results, but only on the SALU.
910   getActionDefinitionsBuilder(G_CTPOP)
911     .legalFor({{S32, S32}, {S32, S64}})
912     .clampScalar(0, S32, S32)
913     .clampScalar(1, S32, S64)
914     .scalarize(0)
915     .widenScalarToNextPow2(0, 32)
916     .widenScalarToNextPow2(1, 32);
917 
918   // The hardware instructions return a different result on 0 than the generic
919   // instructions expect. The hardware produces -1, but these produce the
920   // bitwidth.
921   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
922     .scalarize(0)
923     .clampScalar(0, S32, S32)
924     .clampScalar(1, S32, S64)
925     .widenScalarToNextPow2(0, 32)
926     .widenScalarToNextPow2(1, 32)
927     .lower();
928 
929   // The 64-bit versions produce 32-bit results, but only on the SALU.
930   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
931     .legalFor({{S32, S32}, {S32, S64}})
932     .clampScalar(0, S32, S32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32)
936     .widenScalarToNextPow2(1, 32);
937 
938   getActionDefinitionsBuilder(G_BITREVERSE)
939     .legalFor({S32})
940     .clampScalar(0, S32, S32)
941     .scalarize(0);
942 
943   if (ST.has16BitInsts()) {
944     getActionDefinitionsBuilder(G_BSWAP)
945       .legalFor({S16, S32, V2S16})
946       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
949       .widenScalarToNextPow2(0)
950       .clampScalar(0, S16, S32)
951       .scalarize(0);
952 
953     if (ST.hasVOP3PInsts()) {
954       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
955         .legalFor({S32, S16, V2S16})
956         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
957         .clampMaxNumElements(0, S16, 2)
958         .minScalar(0, S16)
959         .widenScalarToNextPow2(0)
960         .scalarize(0)
961         .lower();
962     } else {
963       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
964         .legalFor({S32, S16})
965         .widenScalarToNextPow2(0)
966         .minScalar(0, S16)
967         .scalarize(0)
968         .lower();
969     }
970   } else {
971     // TODO: Should have same legality without v_perm_b32
972     getActionDefinitionsBuilder(G_BSWAP)
973       .legalFor({S32})
974       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
977       .widenScalarToNextPow2(0)
978       .maxScalar(0, S32)
979       .scalarize(0)
980       .lower();
981 
982     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
983       .legalFor({S32})
984       .minScalar(0, S32)
985       .widenScalarToNextPow2(0)
986       .scalarize(0)
987       .lower();
988   }
989 
990   getActionDefinitionsBuilder(G_INTTOPTR)
991     // List the common cases
992     .legalForCartesianProduct(AddrSpaces64, {S64})
993     .legalForCartesianProduct(AddrSpaces32, {S32})
994     .scalarize(0)
995     // Accept any address space as long as the size matches
996     .legalIf(sameSize(0, 1))
997     .widenScalarIf(smallerThan(1, 0),
998       [](const LegalityQuery &Query) {
999         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1000       })
1001     .narrowScalarIf(largerThan(1, 0),
1002       [](const LegalityQuery &Query) {
1003         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004       });
1005 
1006   getActionDefinitionsBuilder(G_PTRTOINT)
1007     // List the common cases
1008     .legalForCartesianProduct(AddrSpaces64, {S64})
1009     .legalForCartesianProduct(AddrSpaces32, {S32})
1010     .scalarize(0)
1011     // Accept any address space as long as the size matches
1012     .legalIf(sameSize(0, 1))
1013     .widenScalarIf(smallerThan(0, 1),
1014       [](const LegalityQuery &Query) {
1015         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1016       })
1017     .narrowScalarIf(
1018       largerThan(0, 1),
1019       [](const LegalityQuery &Query) {
1020         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021       });
1022 
1023   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1024     .scalarize(0)
1025     .custom();
1026 
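  // Whether a load or store must be split into multiple memory operations:
  // wider than the address space allows, an odd dword count the hardware
  // can't handle, or insufficiently aligned.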
1027   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1028                                     bool IsLoad) -> bool {
1029     const LLT DstTy = Query.Types[0];
1030 
1031     // Split vector extloads.
1032     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1033     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1034 
1035     if (MemSize < DstTy.getSizeInBits())
1036       MemSize = std::max(MemSize, AlignBits);
1037 
1038     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1039       return true;
1040 
1041     const LLT PtrTy = Query.Types[1];
1042     unsigned AS = PtrTy.getAddressSpace();
1043     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1044       return true;
1045 
1046     // Catch weird sized loads that don't evenly divide into the access sizes
1047     // TODO: May be able to widen depending on alignment etc.
1048     unsigned NumRegs = (MemSize + 31) / 32;
1049     if (NumRegs == 3) {
1050       if (!ST.hasDwordx3LoadStores())
1051         return true;
1052     } else {
1053       // If the alignment allows, these should have been widened.
1054       if (!isPowerOf2_32(NumRegs))
1055         return true;
1056     }
1057 
1058     if (AlignBits < MemSize) {
1059       const SITargetLowering *TLI = ST.getTargetLowering();
1060       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1061                                                       Align(AlignBits / 8));
1062     }
1063 
1064     return false;
1065   };
1066 
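  // Minimum alignments (in bits) to require for global and constant accesses
  // of each width; 0 means no extra requirement when unaligned buffer access
  // is enabled.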
1067   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1068   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1069   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1070 
1071   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1072   // LDS
1073   // TODO: Unsupported flat for SI.
1074 
1075   for (unsigned Op : {G_LOAD, G_STORE}) {
1076     const bool IsStore = Op == G_STORE;
1077 
1078     auto &Actions = getActionDefinitionsBuilder(Op);
1079     // Explicitly list some common cases.
1080     // TODO: Does this help compile time at all?
1081     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1082                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1083                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1084                                       {S64, GlobalPtr, 64, GlobalAlign32},
1085                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1086                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1087                                       {S32, GlobalPtr, 8, GlobalAlign8},
1088                                       {S32, GlobalPtr, 16, GlobalAlign16},
1089 
1090                                       {S32, LocalPtr, 32, 32},
1091                                       {S64, LocalPtr, 64, 32},
1092                                       {V2S32, LocalPtr, 64, 32},
1093                                       {S32, LocalPtr, 8, 8},
1094                                       {S32, LocalPtr, 16, 16},
1095                                       {V2S16, LocalPtr, 32, 32},
1096 
1097                                       {S32, PrivatePtr, 32, 32},
1098                                       {S32, PrivatePtr, 8, 8},
1099                                       {S32, PrivatePtr, 16, 16},
1100                                       {V2S16, PrivatePtr, 32, 32},
1101 
1102                                       {S32, ConstantPtr, 32, GlobalAlign32},
1103                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1104                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1105                                       {S64, ConstantPtr, 64, GlobalAlign32},
1106                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1107     Actions.legalIf(
1108       [=](const LegalityQuery &Query) -> bool {
1109         return isLoadStoreLegal(ST, Query, Op);
1110       });
1111 
1112     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1113     // 64-bits.
1114     //
1115     // TODO: Should generalize bitcast action into coerce, which will also cover
1116     // inserting addrspacecasts.
1117     Actions.customIf(typeIs(1, Constant32Ptr));
1118 
1119     // Turn any illegal element vectors into something easier to deal
1120     // with. These will ultimately produce 32-bit scalar shifts to extract the
1121     // parts anyway.
1122     //
1123     // For odd 16-bit element vectors, prefer to split those into pieces with
1124     // 16-bit vector parts.
1125     Actions.bitcastIf(
1126       [=](const LegalityQuery &Query) -> bool {
1127         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1128                                           Query.MMODescrs[0].SizeInBits);
1129       }, bitcastToRegisterType(0));
1130 
1131     if (!IsStore) {
1132       // Widen suitably aligned loads by loading extra bytes. The standard
1133       // legalization actions can't properly express widening memory operands.
1134       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1135         return shouldWidenLoad(ST, Query, G_LOAD);
1136       });
1137     }
1138 
1139     // FIXME: load/store narrowing should be moved to lower action
1140     Actions
1141         .narrowScalarIf(
1142             [=](const LegalityQuery &Query) -> bool {
1143               return !Query.Types[0].isVector() &&
1144                      needToSplitMemOp(Query, Op == G_LOAD);
1145             },
1146             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1147               const LLT DstTy = Query.Types[0];
1148               const LLT PtrTy = Query.Types[1];
1149 
1150               const unsigned DstSize = DstTy.getSizeInBits();
1151               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1152 
1153               // Split extloads.
1154               if (DstSize > MemSize)
1155                 return std::make_pair(0, LLT::scalar(MemSize));
1156 
1157               if (!isPowerOf2_32(DstSize)) {
1158                 // We're probably decomposing an odd sized store. Try to split
1159                 // to the widest type. TODO: Account for alignment. As-is it
1160                 // should be OK, since the new parts will be further legalized.
1161                 unsigned FloorSize = PowerOf2Floor(DstSize);
1162                 return std::make_pair(0, LLT::scalar(FloorSize));
1163               }
1164 
1165               if (DstSize > 32 && (DstSize % 32 != 0)) {
1166                 // FIXME: Need a way to specify non-extload of larger size if
1167                 // suitably aligned.
1168                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1169               }
1170 
1171               unsigned MaxSize = maxSizeForAddrSpace(ST,
1172                                                      PtrTy.getAddressSpace(),
1173                                                      Op == G_LOAD);
1174               if (MemSize > MaxSize)
1175                 return std::make_pair(0, LLT::scalar(MaxSize));
1176 
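              // Split the access into pieces no larger than the known
              // alignment.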
1177               unsigned Align = Query.MMODescrs[0].AlignInBits;
1178               return std::make_pair(0, LLT::scalar(Align));
1179             })
1180         .fewerElementsIf(
1181             [=](const LegalityQuery &Query) -> bool {
1182               return Query.Types[0].isVector() &&
1183                      needToSplitMemOp(Query, Op == G_LOAD);
1184             },
1185             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1186               const LLT DstTy = Query.Types[0];
1187               const LLT PtrTy = Query.Types[1];
1188 
1189               LLT EltTy = DstTy.getElementType();
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193 
1194               // FIXME: Handle widened to power of 2 results better. This ends
1195               // up scalarizing.
1196               // FIXME: 3 element stores scalarized on SI
1197 
1198               // Split if it's too large for the address space.
1199               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1200                 unsigned NumElts = DstTy.getNumElements();
1201                 unsigned EltSize = EltTy.getSizeInBits();
1202 
1203                 if (MaxSize % EltSize == 0) {
1204                   return std::make_pair(
1205                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(0,
1217                                       LLT::vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1222               if (DstTy.getSizeInBits() > MemSize)
1223                 return std::make_pair(0, EltTy);
1224 
1225               unsigned EltSize = EltTy.getSizeInBits();
1226               unsigned DstSize = DstTy.getSizeInBits();
1227               if (!isPowerOf2_32(DstSize)) {
1228                 // We're probably decomposing an odd sized store. Try to split
1229                 // to the widest type. TODO: Account for alignment. As-is it
1230                 // should be OK, since the new parts will be further legalized.
1231                 unsigned FloorSize = PowerOf2Floor(DstSize);
1232                 return std::make_pair(
1233                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1234               }
1235 
1236               // Need to split because of alignment.
1237               unsigned Align = Query.MMODescrs[0].AlignInBits;
1238               if (EltSize > Align &&
1239                   (EltSize / Align < DstTy.getNumElements())) {
1240                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1241               }
1242 
1243               // May need relegalization for the scalars.
1244               return std::make_pair(0, EltTy);
1245             })
1246     .lowerIfMemSizeNotPow2()
1247     .minScalar(0, S32);
1248 
1249     if (IsStore)
1250       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1251 
1252     Actions
1253         .widenScalarToNextPow2(0)
1254         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1255         .lower();
1256   }
1257 
1258   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1259                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1260                                                   {S32, GlobalPtr, 16, 2 * 8},
1261                                                   {S32, LocalPtr, 8, 8},
1262                                                   {S32, LocalPtr, 16, 16},
1263                                                   {S32, PrivatePtr, 8, 8},
1264                                                   {S32, PrivatePtr, 16, 16},
1265                                                   {S32, ConstantPtr, 8, 8},
1266                                                   {S32, ConstantPtr, 16, 2 * 8}});
1267   if (ST.hasFlatAddressSpace()) {
1268     ExtLoads.legalForTypesWithMemDesc(
1269         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1270   }
1271 
1272   ExtLoads.clampScalar(0, S32, S32)
1273           .widenScalarToNextPow2(0)
1274           .unsupportedIfMemSizeNotPow2()
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   if (ST.hasLDSFPAtomics()) {
1290     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1291       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292   }
1293 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1296   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1297     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1298                 {S32, FlatPtr}, {S64, FlatPtr}})
1299     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1300                {S32, RegionPtr}, {S64, RegionPtr}});
1301   // TODO: Pointer types, any 32-bit or 64-bit vector
1302 
1303   // Condition should be s32 for scalar, s1 for vector.
1304   getActionDefinitionsBuilder(G_SELECT)
1305     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1306           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1307           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1308     .clampScalar(0, S16, S64)
1309     .scalarize(1)
1310     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1311     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1312     .clampMaxNumElements(0, S32, 2)
1313     .clampMaxNumElements(0, LocalPtr, 2)
1314     .clampMaxNumElements(0, PrivatePtr, 2)
1315     .scalarize(0)
1316     .widenScalarToNextPow2(0)
1317     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1318 
1319   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1320   // be more flexible with the shift amount type.
1321   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1322     .legalFor({{S32, S32}, {S64, S32}});
1323   if (ST.has16BitInsts()) {
1324     if (ST.hasVOP3PInsts()) {
1325       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1326             .clampMaxNumElements(0, S16, 2);
1327     } else
1328       Shifts.legalFor({{S16, S16}});
1329 
1330     // TODO: Support 16-bit shift amounts for all types
1331     Shifts.widenScalarIf(
1332       [=](const LegalityQuery &Query) {
1333         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1334         // 32-bit amount.
1335         const LLT ValTy = Query.Types[0];
1336         const LLT AmountTy = Query.Types[1];
1337         return ValTy.getSizeInBits() <= 16 &&
1338                AmountTy.getSizeInBits() < 16;
1339       }, changeTo(1, S16));
1340     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1341     Shifts.clampScalar(1, S32, S32);
1342     Shifts.clampScalar(0, S16, S64);
1343     Shifts.widenScalarToNextPow2(0, 16);
1344 
1345     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1346       .minScalar(0, S16)
1347       .scalarize(0)
1348       .lower();
1349   } else {
1350     // Make sure we legalize the shift amount type first, as the general
1351     // expansion for the shifted type will produce much worse code if it hasn't
1352     // been truncated already.
1353     Shifts.clampScalar(1, S32, S32);
1354     Shifts.clampScalar(0, S32, S64);
1355     Shifts.widenScalarToNextPow2(0, 32);
1356 
1357     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1358       .minScalar(0, S32)
1359       .scalarize(0)
1360       .lower();
1361   }
1362   Shifts.scalarize(0);
1363 
1364   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1365     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1366     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1367     unsigned IdxTypeIdx = 2;
1368 
1369     getActionDefinitionsBuilder(Op)
1370       .customIf([=](const LegalityQuery &Query) {
1371           const LLT EltTy = Query.Types[EltTypeIdx];
1372           const LLT VecTy = Query.Types[VecTypeIdx];
1373           const LLT IdxTy = Query.Types[IdxTypeIdx];
1374           const unsigned EltSize = EltTy.getSizeInBits();
1375           return (EltSize == 32 || EltSize == 64) &&
1376                   VecTy.getSizeInBits() % 32 == 0 &&
1377                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1378                   IdxTy.getSizeInBits() == 32;
1379         })
1380       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1381                  bitcastToVectorElement32(VecTypeIdx))
1382       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1383       .bitcastIf(
1384         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1385         [=](const LegalityQuery &Query) {
1386           // For > 64-bit element types, try to turn this into a 64-bit
1387           // element vector since we may be able to do better indexing
1388           // if this is scalar. If not, fall back to 32.
1389           const LLT EltTy = Query.Types[EltTypeIdx];
1390           const LLT VecTy = Query.Types[VecTypeIdx];
1391           const unsigned DstEltSize = EltTy.getSizeInBits();
1392           const unsigned VecSize = VecTy.getSizeInBits();
1393 
1394           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1395           return std::make_pair(
1396             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1397         })
1398       .clampScalar(EltTypeIdx, S32, S64)
1399       .clampScalar(VecTypeIdx, S32, S64)
1400       .clampScalar(IdxTypeIdx, S32, S32)
1401       .clampMaxNumElements(VecTypeIdx, S32, 32)
1402       // TODO: Clamp elements for 64-bit vectors?
1403       // It should only be necessary with variable indexes.
1404       // As a last resort, lower to the stack
1405       .lower();
1406   }
1407 
1408   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1409     .unsupportedIf([=](const LegalityQuery &Query) {
1410         const LLT &EltTy = Query.Types[1].getElementType();
1411         return Query.Types[0] != EltTy;
1412       });
1413 
1414   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1415     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1416     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1417 
1418     // FIXME: Doesn't handle extract of illegal sizes.
1419     getActionDefinitionsBuilder(Op)
1420       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1421       // FIXME: Multiples of 16 should not be legal.
1422       .legalIf([=](const LegalityQuery &Query) {
1423           const LLT BigTy = Query.Types[BigTyIdx];
1424           const LLT LitTy = Query.Types[LitTyIdx];
1425           return (BigTy.getSizeInBits() % 32 == 0) &&
1426                  (LitTy.getSizeInBits() % 16 == 0);
1427         })
1428       .widenScalarIf(
1429         [=](const LegalityQuery &Query) {
1430           const LLT BigTy = Query.Types[BigTyIdx];
1431           return (BigTy.getScalarSizeInBits() < 16);
1432         },
1433         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1434       .widenScalarIf(
1435         [=](const LegalityQuery &Query) {
1436           const LLT LitTy = Query.Types[LitTyIdx];
1437           return (LitTy.getScalarSizeInBits() < 16);
1438         },
1439         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1440       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1441       .widenScalarToNextPow2(BigTyIdx, 32);
1442 
1443   }
1444 
1445   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1446     .legalForCartesianProduct(AllS32Vectors, {S32})
1447     .legalForCartesianProduct(AllS64Vectors, {S64})
1448     .clampNumElements(0, V16S32, V32S32)
1449     .clampNumElements(0, V2S64, V16S64)
1450     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1451 
1452   if (ST.hasScalarPackInsts()) {
1453     BuildVector
1454       // FIXME: Should probably widen s1 vectors straight to s32
1455       .minScalarOrElt(0, S16)
1456       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1457       .minScalar(1, S32);
1458 
1459     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460       .legalFor({V2S16, S32})
1461       .lower();
1462     BuildVector.minScalarOrElt(0, S32);
1463   } else {
1464     BuildVector.customFor({V2S16, S16});
1465     BuildVector.minScalarOrElt(0, S32);
1466 
1467     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1468       .customFor({V2S16, S32})
1469       .lower();
1470   }
1471 
1472   BuildVector.legalIf(isRegisterType(0));
1473 
1474   // FIXME: Clamp maximum size
1475   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1476     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1477     .clampMaxNumElements(0, S32, 32)
1478     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1479     .clampMaxNumElements(0, S16, 64);
1480 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1483   if (ST.hasVOP3PInsts()) {
1484     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1485       .customFor({V2S16, V2S16})
1486       .lower();
1487   } else
1488     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1489 
1490   // Merge/Unmerge
1491   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1492     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1493     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1494 
1495     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1496       const LLT Ty = Query.Types[TypeIdx];
1497       if (Ty.isVector()) {
1498         const LLT &EltTy = Ty.getElementType();
1499         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1500           return true;
1501         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1502           return true;
1503       }
1504       return false;
1505     };
1506 
1507     auto &Builder = getActionDefinitionsBuilder(Op)
1508       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1509       .lowerFor({{S16, V2S16}})
1510       .lowerIf([=](const LegalityQuery &Query) {
1511           const LLT BigTy = Query.Types[BigTyIdx];
1512           return BigTy.getSizeInBits() == 32;
1513         })
1514       // Try to widen to s16 first for small types.
1515       // TODO: Only do this on targets with legal s16 shifts
1516       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1517       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1518       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1519       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1520                            elementTypeIs(1, S16)),
1521                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1525       .clampScalar(LitTyIdx, S32, S512)
1526       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1527       // Break up vectors with weird elements into scalars
1528       .fewerElementsIf(
1529         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1530         scalarize(0))
1531       .fewerElementsIf(
1532         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1533         scalarize(1))
1534       .clampScalar(BigTyIdx, S32, MaxScalar);
1535 
1536     if (Op == G_MERGE_VALUES) {
1537       Builder.widenScalarIf(
1538         // TODO: Use 16-bit shifts if legal for 8-bit values?
1539         [=](const LegalityQuery &Query) {
1540           const LLT Ty = Query.Types[LitTyIdx];
1541           return Ty.getSizeInBits() < 32;
1542         },
1543         changeTo(LitTyIdx, S32));
1544     }
1545 
1546     Builder.widenScalarIf(
1547       [=](const LegalityQuery &Query) {
1548         const LLT Ty = Query.Types[BigTyIdx];
1549         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1550           Ty.getSizeInBits() % 16 != 0;
1551       },
1552       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1555         const LLT &Ty = Query.Types[BigTyIdx];
1556         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1557         if (NewSizeInBits >= 256) {
1558           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1559           if (RoundedTo < NewSizeInBits)
1560             NewSizeInBits = RoundedTo;
1561         }
1562         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1563       })
1564       // Any vectors left are the wrong size. Scalarize them.
1565       .scalarize(0)
1566       .scalarize(1);
1567   }
1568 
1569   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1570   // RegBankSelect.
1571   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1572     .legalFor({{S32}, {S64}});
1573 
1574   if (ST.hasVOP3PInsts()) {
1575     SextInReg.lowerFor({{V2S16}})
1576       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1577       // get more vector shift opportunities, since we'll get those when
1578       // expanded.
1579       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1580   } else if (ST.has16BitInsts()) {
1581     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1582   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1585     SextInReg.lowerFor({{S32}, {S64}});
1586   }
1587 
1588   SextInReg
1589     .scalarize(0)
1590     .clampScalar(0, S32, S64)
1591     .lower();
1592 
1593   getActionDefinitionsBuilder(G_FSHR)
1594     .legalFor({{S32, S32}})
1595     .scalarize(0)
1596     .lower();
1597 
1598   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1599     .legalFor({S64});
1600 
1601   getActionDefinitionsBuilder(G_FENCE)
1602     .alwaysLegal();
1603 
1604   getActionDefinitionsBuilder({
1605       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1606       G_FCOPYSIGN,
1607 
1608       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1609       G_ATOMICRMW_NAND,
1610       G_ATOMICRMW_FSUB,
1611       G_READ_REGISTER,
1612       G_WRITE_REGISTER,
1613 
1614       G_SADDO, G_SSUBO,
1615 
      // TODO: Implement
1617       G_FMINIMUM, G_FMAXIMUM,
1618       G_FSHL
1619     }).lower();
1620 
1621   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1622         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1623         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1624     .unsupported();
1625 
1626   computeTables();
1627   verify(*ST.getInstrInfo());
1628 }
1629 
1630 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1631                                          MachineInstr &MI) const {
1632   MachineIRBuilder &B = Helper.MIRBuilder;
1633   MachineRegisterInfo &MRI = *B.getMRI();
1634 
1635   switch (MI.getOpcode()) {
1636   case TargetOpcode::G_ADDRSPACE_CAST:
1637     return legalizeAddrSpaceCast(MI, MRI, B);
1638   case TargetOpcode::G_FRINT:
1639     return legalizeFrint(MI, MRI, B);
1640   case TargetOpcode::G_FCEIL:
1641     return legalizeFceil(MI, MRI, B);
1642   case TargetOpcode::G_FREM:
1643     return legalizeFrem(MI, MRI, B);
1644   case TargetOpcode::G_INTRINSIC_TRUNC:
1645     return legalizeIntrinsicTrunc(MI, MRI, B);
1646   case TargetOpcode::G_SITOFP:
1647     return legalizeITOFP(MI, MRI, B, true);
1648   case TargetOpcode::G_UITOFP:
1649     return legalizeITOFP(MI, MRI, B, false);
1650   case TargetOpcode::G_FPTOSI:
1651     return legalizeFPTOI(MI, MRI, B, true);
1652   case TargetOpcode::G_FPTOUI:
1653     return legalizeFPTOI(MI, MRI, B, false);
1654   case TargetOpcode::G_FMINNUM:
1655   case TargetOpcode::G_FMAXNUM:
1656   case TargetOpcode::G_FMINNUM_IEEE:
1657   case TargetOpcode::G_FMAXNUM_IEEE:
1658     return legalizeMinNumMaxNum(Helper, MI);
1659   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1660     return legalizeExtractVectorElt(MI, MRI, B);
1661   case TargetOpcode::G_INSERT_VECTOR_ELT:
1662     return legalizeInsertVectorElt(MI, MRI, B);
1663   case TargetOpcode::G_SHUFFLE_VECTOR:
1664     return legalizeShuffleVector(MI, MRI, B);
1665   case TargetOpcode::G_FSIN:
1666   case TargetOpcode::G_FCOS:
1667     return legalizeSinCos(MI, MRI, B);
1668   case TargetOpcode::G_GLOBAL_VALUE:
1669     return legalizeGlobalValue(MI, MRI, B);
1670   case TargetOpcode::G_LOAD:
1671     return legalizeLoad(Helper, MI);
1672   case TargetOpcode::G_FMAD:
1673     return legalizeFMad(MI, MRI, B);
1674   case TargetOpcode::G_FDIV:
1675     return legalizeFDIV(MI, MRI, B);
1676   case TargetOpcode::G_UDIV:
1677   case TargetOpcode::G_UREM:
1678     return legalizeUDIV_UREM(MI, MRI, B);
1679   case TargetOpcode::G_SDIV:
1680   case TargetOpcode::G_SREM:
1681     return legalizeSDIV_SREM(MI, MRI, B);
1682   case TargetOpcode::G_ATOMIC_CMPXCHG:
1683     return legalizeAtomicCmpXChg(MI, MRI, B);
1684   case TargetOpcode::G_FLOG:
1685     return legalizeFlog(MI, B, numbers::ln2f);
1686   case TargetOpcode::G_FLOG10:
1687     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1688   case TargetOpcode::G_FEXP:
1689     return legalizeFExp(MI, B);
1690   case TargetOpcode::G_FPOW:
1691     return legalizeFPow(MI, B);
1692   case TargetOpcode::G_FFLOOR:
1693     return legalizeFFloor(MI, MRI, B);
1694   case TargetOpcode::G_BUILD_VECTOR:
1695     return legalizeBuildVector(MI, MRI, B);
1696   default:
1697     return false;
1698   }
1699 
1700   llvm_unreachable("expected switch to return");
1701 }
1702 
1703 Register AMDGPULegalizerInfo::getSegmentAperture(
1704   unsigned AS,
1705   MachineRegisterInfo &MRI,
1706   MachineIRBuilder &B) const {
1707   MachineFunction &MF = B.getMF();
1708   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1709   const LLT S32 = LLT::scalar(32);
1710 
1711   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1712 
1713   if (ST.hasApertureRegs()) {
1714     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1715     // getreg.
1716     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1717         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1718         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1719     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1720         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1721         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
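    // Pack the hardware register id, the field offset, and the field width - 1
    // into the S_GETREG_B32 immediate.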
1722     unsigned Encoding =
1723         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1724         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1725         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1726 
1727     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1728 
1729     B.buildInstr(AMDGPU::S_GETREG_B32)
1730       .addDef(GetReg)
1731       .addImm(Encoding);
1732     MRI.setType(GetReg, S32);
1733 
1734     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1735     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1736   }
1737 
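  // Without aperture registers, the aperture is loaded from the amd_queue_t
  // structure reachable through the queue pointer input.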
1738   Register QueuePtr = MRI.createGenericVirtualRegister(
1739     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1740 
1741   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1742     return Register();
1743 
1744   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1745   // private_segment_aperture_base_hi.
1746   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1747 
1748   // TODO: can we be smarter about machine pointer info?
1749   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1750   MachineMemOperand *MMO = MF.getMachineMemOperand(
1751       PtrInfo,
1752       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1753           MachineMemOperand::MOInvariant,
1754       4, commonAlignment(Align(64), StructOffset));
1755 
1756   Register LoadAddr;
1757 
1758   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1759   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1760 }
1761 
1762 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1763   MachineInstr &MI, MachineRegisterInfo &MRI,
1764   MachineIRBuilder &B) const {
1765   MachineFunction &MF = B.getMF();
1766 
1767   const LLT S32 = LLT::scalar(32);
1768   Register Dst = MI.getOperand(0).getReg();
1769   Register Src = MI.getOperand(1).getReg();
1770 
1771   LLT DstTy = MRI.getType(Dst);
1772   LLT SrcTy = MRI.getType(Src);
1773   unsigned DestAS = DstTy.getAddressSpace();
1774   unsigned SrcAS = SrcTy.getAddressSpace();
1775 
1776   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1777   // vector element.
1778   assert(!DstTy.isVector());
1779 
1780   const AMDGPUTargetMachine &TM
1781     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1782 
1783   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1784     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1785     return true;
1786   }
1787 
1788   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1789     // Truncate.
1790     B.buildExtract(Dst, Src, 0);
1791     MI.eraseFromParent();
1792     return true;
1793   }
1794 
1795   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1796     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1797     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1798 
    // FIXME: This is a bit ugly because it merges two 32-bit pointers into a
    // pointer of a different type. Merge operands are required to be the same
    // type, but creating an extra ptrtoint would be kind of pointless.
1802     auto HighAddr = B.buildConstant(
1803       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1804     B.buildMerge(Dst, {Src, HighAddr});
1805     MI.eraseFromParent();
1806     return true;
1807   }
1808 
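  // flat -> local/private: the segment pointer is the low 32 bits of the flat
  // pointer, with the flat null pointer mapped to the segment null value.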
1809   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1810     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1811            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1812     unsigned NullVal = TM.getNullPointerValue(DestAS);
1813 
1814     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1815     auto FlatNull = B.buildConstant(SrcTy, 0);
1816 
1817     // Extract low 32-bits of the pointer.
1818     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1819 
1820     auto CmpRes =
1821         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1822     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1823 
1824     MI.eraseFromParent();
1825     return true;
1826   }
1827 
1828   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1829     return false;
1830 
1831   if (!ST.hasFlatAddressSpace())
1832     return false;
1833 
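  // local/private -> flat: build the 64-bit flat pointer from the 32-bit
  // segment offset and the aperture base, mapping the segment null value to
  // the flat null pointer.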
1834   auto SegmentNull =
1835       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1836   auto FlatNull =
1837       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1838 
1839   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1840   if (!ApertureReg.isValid())
1841     return false;
1842 
1843   auto CmpRes =
1844       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1845 
1846   // Coerce the type of the low half of the result so we can use merge_values.
1847   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1848 
1849   // TODO: Should we allow mismatched types but matching sizes in merges to
1850   // avoid the ptrtoint?
1851   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1852   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1853 
1854   MI.eraseFromParent();
1855   return true;
1856 }
1857 
1858 bool AMDGPULegalizerInfo::legalizeFrint(
1859   MachineInstr &MI, MachineRegisterInfo &MRI,
1860   MachineIRBuilder &B) const {
1861   Register Src = MI.getOperand(1).getReg();
1862   LLT Ty = MRI.getType(Src);
1863   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1864 
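  // Adding and subtracting 2^52 (with the sign of the input) forces the FP
  // unit to round the value to an integer. Inputs with a magnitude of at
  // least 2^52 are already integral and are returned unchanged.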
1865   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1866   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1867 
1868   auto C1 = B.buildFConstant(Ty, C1Val);
1869   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1870 
1871   // TODO: Should this propagate fast-math-flags?
1872   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1873   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1874 
1875   auto C2 = B.buildFConstant(Ty, C2Val);
1876   auto Fabs = B.buildFAbs(Ty, Src);
1877 
1878   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1879   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1880   MI.eraseFromParent();
1881   return true;
1882 }
1883 
1884 bool AMDGPULegalizerInfo::legalizeFceil(
1885   MachineInstr &MI, MachineRegisterInfo &MRI,
1886   MachineIRBuilder &B) const {
1887 
1888   const LLT S1 = LLT::scalar(1);
1889   const LLT S64 = LLT::scalar(64);
1890 
1891   Register Src = MI.getOperand(1).getReg();
1892   assert(MRI.getType(Src) == S64);
1893 
1894   // result = trunc(src)
1895   // if (src > 0.0 && src != result)
1896   //   result += 1.0
1897 
1898   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1899 
1900   const auto Zero = B.buildFConstant(S64, 0.0);
1901   const auto One = B.buildFConstant(S64, 1.0);
1902   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1903   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1904   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1905   auto Add = B.buildSelect(S64, And, One, Zero);
1906 
1907   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1910 }
1911 
1912 bool AMDGPULegalizerInfo::legalizeFrem(
1913   MachineInstr &MI, MachineRegisterInfo &MRI,
1914   MachineIRBuilder &B) const {
1915     Register DstReg = MI.getOperand(0).getReg();
1916     Register Src0Reg = MI.getOperand(1).getReg();
1917     Register Src1Reg = MI.getOperand(2).getReg();
1918     auto Flags = MI.getFlags();
1919     LLT Ty = MRI.getType(DstReg);
1920 
1921     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1922     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1923     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1924     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1925     MI.eraseFromParent();
1926     return true;
1927 }
1928 
1929 static MachineInstrBuilder extractF64Exponent(Register Hi,
1930                                               MachineIRBuilder &B) {
1931   const unsigned FractBits = 52;
1932   const unsigned ExpBits = 11;
1933   LLT S32 = LLT::scalar(32);
1934 
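  // The f64 exponent occupies bits [62:52], i.e. bits [30:20] of the high
  // 32-bit word. Extract those 11 bits and subtract the bias (1023).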
1935   auto Const0 = B.buildConstant(S32, FractBits - 32);
1936   auto Const1 = B.buildConstant(S32, ExpBits);
1937 
1938   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1939     .addUse(Hi)
1940     .addUse(Const0.getReg(0))
1941     .addUse(Const1.getReg(0));
1942 
1943   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1944 }
1945 
1946 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1947   MachineInstr &MI, MachineRegisterInfo &MRI,
1948   MachineIRBuilder &B) const {
1949   const LLT S1 = LLT::scalar(1);
1950   const LLT S32 = LLT::scalar(32);
1951   const LLT S64 = LLT::scalar(64);
1952 
1953   Register Src = MI.getOperand(1).getReg();
1954   assert(MRI.getType(Src) == S64);
1955 
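  // Truncation is done by masking off the fraction bits below the unbiased
  // exponent. Exponents below 0 produce a signed zero, and exponents above 51
  // mean the value is already an integer.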
1956   // TODO: Should this use extract since the low half is unused?
1957   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1958   Register Hi = Unmerge.getReg(1);
1959 
1960   // Extract the upper half, since this is where we will find the sign and
1961   // exponent.
1962   auto Exp = extractF64Exponent(Hi, B);
1963 
1964   const unsigned FractBits = 52;
1965 
1966   // Extract the sign bit.
1967   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1968   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1969 
1970   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1971 
1972   const auto Zero32 = B.buildConstant(S32, 0);
1973 
1974   // Extend back to 64-bits.
1975   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1976 
1977   auto Shr = B.buildAShr(S64, FractMask, Exp);
1978   auto Not = B.buildNot(S64, Shr);
1979   auto Tmp0 = B.buildAnd(S64, Src, Not);
1980   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1981 
1982   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1983   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1984 
1985   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1986   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1987   MI.eraseFromParent();
1988   return true;
1989 }
1990 
1991 bool AMDGPULegalizerInfo::legalizeITOFP(
1992   MachineInstr &MI, MachineRegisterInfo &MRI,
1993   MachineIRBuilder &B, bool Signed) const {
1994 
1995   Register Dst = MI.getOperand(0).getReg();
1996   Register Src = MI.getOperand(1).getReg();
1997 
1998   const LLT S64 = LLT::scalar(64);
1999   const LLT S32 = LLT::scalar(32);
2000 
2001   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2002 
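  // Convert the two 32-bit halves separately: scale the converted high half
  // by 2^32 with ldexp and add the unsigned conversion of the low half.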
2003   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2004 
2005   auto CvtHi = Signed ?
2006     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2007     B.buildUITOFP(S64, Unmerge.getReg(1));
2008 
2009   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2010 
2011   auto ThirtyTwo = B.buildConstant(S32, 32);
2012   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2013     .addUse(CvtHi.getReg(0))
2014     .addUse(ThirtyTwo.getReg(0));
2015 
2016   // TODO: Should this propagate fast-math-flags?
2017   B.buildFAdd(Dst, LdExp, CvtLo);
2018   MI.eraseFromParent();
2019   return true;
2020 }
2021 
2022 // TODO: Copied from DAG implementation. Verify logic and document how this
2023 // actually works.
2024 bool AMDGPULegalizerInfo::legalizeFPTOI(
2025   MachineInstr &MI, MachineRegisterInfo &MRI,
2026   MachineIRBuilder &B, bool Signed) const {
2027 
2028   Register Dst = MI.getOperand(0).getReg();
2029   Register Src = MI.getOperand(1).getReg();
2030 
2031   const LLT S64 = LLT::scalar(64);
2032   const LLT S32 = LLT::scalar(32);
2033 
2034   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2035 
2036   unsigned Flags = MI.getFlags();
2037 
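  // Split the truncated value into 32-bit halves: K0 = 2^-32 scales the value
  // down to produce the high half, and K1 = -(2^32) recovers the low half via
  // fma. Each half is then converted to an integer separately.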
2038   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2039   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2040   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2041 
2042   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2043   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2044   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2045 
2046   auto Hi = Signed ?
2047     B.buildFPTOSI(S32, FloorMul) :
2048     B.buildFPTOUI(S32, FloorMul);
2049   auto Lo = B.buildFPTOUI(S32, Fma);
2050 
2051   B.buildMerge(Dst, { Lo, Hi });
2052   MI.eraseFromParent();
2053 
2054   return true;
2055 }
2056 
2057 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2058                                                MachineInstr &MI) const {
2059   MachineFunction &MF = Helper.MIRBuilder.getMF();
2060   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2061 
2062   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2063                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2064 
  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
2067   if (!MFI->getMode().IEEE)
2068     return !IsIEEEOp;
2069 
2070   if (IsIEEEOp)
2071     return true;
2072 
2073   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2074 }
2075 
2076 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2077   MachineInstr &MI, MachineRegisterInfo &MRI,
2078   MachineIRBuilder &B) const {
2079   // TODO: Should move some of this into LegalizerHelper.
2080 
2081   // TODO: Promote dynamic indexing of s16 to s32
2082 
2083   // FIXME: Artifact combiner probably should have replaced the truncated
2084   // constant before this, so we shouldn't need
2085   // getConstantVRegValWithLookThrough.
2086   Optional<ValueAndVReg> MaybeIdxVal =
2087       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2088   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2089     return true;
2090   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2091 
2092   Register Dst = MI.getOperand(0).getReg();
2093   Register Vec = MI.getOperand(1).getReg();
2094 
2095   LLT VecTy = MRI.getType(Vec);
2096   LLT EltTy = VecTy.getElementType();
2097   assert(EltTy == MRI.getType(Dst));
2098 
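  // A constant in-bounds index turns the extract into a simple subregister
  // extract; an out-of-bounds index produces undef.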
2099   if (IdxVal < VecTy.getNumElements())
2100     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2101   else
2102     B.buildUndef(Dst);
2103 
2104   MI.eraseFromParent();
2105   return true;
2106 }
2107 
2108 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2109   MachineInstr &MI, MachineRegisterInfo &MRI,
2110   MachineIRBuilder &B) const {
2111   // TODO: Should move some of this into LegalizerHelper.
2112 
2113   // TODO: Promote dynamic indexing of s16 to s32
2114 
2115   // FIXME: Artifact combiner probably should have replaced the truncated
2116   // constant before this, so we shouldn't need
2117   // getConstantVRegValWithLookThrough.
2118   Optional<ValueAndVReg> MaybeIdxVal =
2119       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2120   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2121     return true;
2122 
2123   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2124   Register Dst = MI.getOperand(0).getReg();
2125   Register Vec = MI.getOperand(1).getReg();
2126   Register Ins = MI.getOperand(2).getReg();
2127 
2128   LLT VecTy = MRI.getType(Vec);
2129   LLT EltTy = VecTy.getElementType();
2130   assert(EltTy == MRI.getType(Ins));
2131 
2132   if (IdxVal < VecTy.getNumElements())
2133     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2134   else
2135     B.buildUndef(Dst);
2136 
2137   MI.eraseFromParent();
2138   return true;
2139 }
2140 
2141 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2142   MachineInstr &MI, MachineRegisterInfo &MRI,
2143   MachineIRBuilder &B) const {
2144   const LLT V2S16 = LLT::vector(2, 16);
2145 
2146   Register Dst = MI.getOperand(0).getReg();
2147   Register Src0 = MI.getOperand(1).getReg();
2148   LLT DstTy = MRI.getType(Dst);
2149   LLT SrcTy = MRI.getType(Src0);
2150 
2151   if (SrcTy == V2S16 && DstTy == V2S16 &&
2152       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2153     return true;
2154 
2155   MachineIRBuilder HelperBuilder(MI);
2156   GISelObserverWrapper DummyObserver;
2157   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2158   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2159 }
2160 
2161 bool AMDGPULegalizerInfo::legalizeSinCos(
2162   MachineInstr &MI, MachineRegisterInfo &MRI,
2163   MachineIRBuilder &B) const {
2164 
2165   Register DstReg = MI.getOperand(0).getReg();
2166   Register SrcReg = MI.getOperand(1).getReg();
2167   LLT Ty = MRI.getType(DstReg);
2168   unsigned Flags = MI.getFlags();
2169 
2170   Register TrigVal;
2171   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2172   if (ST.hasTrigReducedRange()) {
2173     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2174     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2175       .addUse(MulVal.getReg(0))
2176       .setMIFlags(Flags).getReg(0);
2177   } else
2178     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2179 
2180   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2181     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2182   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2183     .addUse(TrigVal)
2184     .setMIFlags(Flags);
2185   MI.eraseFromParent();
2186   return true;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2190                                                   MachineIRBuilder &B,
2191                                                   const GlobalValue *GV,
2192                                                   int64_t Offset,
2193                                                   unsigned GAFlags) const {
2194   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2195   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2196   // to the following code sequence:
2197   //
2198   // For constant address space:
2199   //   s_getpc_b64 s[0:1]
2200   //   s_add_u32 s0, s0, $symbol
2201   //   s_addc_u32 s1, s1, 0
2202   //
2203   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204   //   a fixup or relocation is emitted to replace $symbol with a literal
2205   //   constant, which is a pc-relative offset from the encoding of the $symbol
2206   //   operand to the global variable.
2207   //
2208   // For global address space:
2209   //   s_getpc_b64 s[0:1]
2210   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2211   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2212   //
2213   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2214   //   fixups or relocations are emitted to replace $symbol@*@lo and
2215   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2216   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2217   //   operand to the global variable.
2218   //
2219   // What we want here is an offset from the value returned by s_getpc
2220   // (which is the address of the s_add_u32 instruction) to the global
2221   // variable, but since the encoding of $symbol starts 4 bytes after the start
2222   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2223   // small. This requires us to add 4 to the global variable offset in order to
2224   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2225   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2226   // instruction.
2227 
2228   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2229 
2230   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2231     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2232 
2233   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2234     .addDef(PCReg);
2235 
2236   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2237   if (GAFlags == SIInstrInfo::MO_NONE)
2238     MIB.addImm(0);
2239   else
2240     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2241 
2242   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2243 
2244   if (PtrTy.getSizeInBits() == 32)
2245     B.buildExtract(DstReg, PCReg, 0);
2246   return true;
}
2248 
2249 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2250   MachineInstr &MI, MachineRegisterInfo &MRI,
2251   MachineIRBuilder &B) const {
2252   Register DstReg = MI.getOperand(0).getReg();
2253   LLT Ty = MRI.getType(DstReg);
2254   unsigned AS = Ty.getAddressSpace();
2255 
2256   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2257   MachineFunction &MF = B.getMF();
2258   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2259 
2260   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2261     if (!MFI->isModuleEntryFunction()) {
2262       const Function &Fn = MF.getFunction();
2263       DiagnosticInfoUnsupported BadLDSDecl(
2264         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2265         DS_Warning);
2266       Fn.getContext().diagnose(BadLDSDecl);
2267 
2268       // We currently don't have a way to correctly allocate LDS objects that
2269       // aren't directly associated with a kernel. We do force inlining of
2270       // functions that use local objects. However, if these dead functions are
2271       // not eliminated, we don't want a compile time error. Just emit a warning
2272       // and a trap, since there should be no callable path here.
2273       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2274       B.buildUndef(DstReg);
2275       MI.eraseFromParent();
2276       return true;
2277     }
2278 
2279     // TODO: We could emit code to handle the initialization somewhere.
2280     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2281       const SITargetLowering *TLI = ST.getTargetLowering();
2282       if (!TLI->shouldUseLDSConstAddress(GV)) {
2283         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2285       }
2286 
2287       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2288         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. It is allocated by the
        // runtime and placed directly after the statically allocated
        // variables. All such declarations share the same offset.
2294         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2295           // Adjust alignment for that dynamic shared memory array.
2296           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2297           LLT S32 = LLT::scalar(32);
2298           auto Sz =
2299               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2300           B.buildIntToPtr(DstReg, Sz);
2301           MI.eraseFromParent();
2302           return true;
2303         }
2304       }
2305 
2306       B.buildConstant(
2307           DstReg,
2308           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2309       MI.eraseFromParent();
2310       return true;
2311     }
2312 
2313     const Function &Fn = MF.getFunction();
2314     DiagnosticInfoUnsupported BadInit(
2315       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2316     Fn.getContext().diagnose(BadInit);
2317     return true;
2318   }
2319 
2320   const SITargetLowering *TLI = ST.getTargetLowering();
2321 
2322   if (TLI->shouldEmitFixup(GV)) {
2323     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2324     MI.eraseFromParent();
2325     return true;
2326   }
2327 
2328   if (TLI->shouldEmitPCReloc(GV)) {
2329     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2330     MI.eraseFromParent();
2331     return true;
2332   }
2333 
2334   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2335   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2336 
2337   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2338       MachinePointerInfo::getGOT(MF),
2339       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2340           MachineMemOperand::MOInvariant,
2341       8 /*Size*/, Align(8));
2342 
2343   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2344 
2345   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2347     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2348     B.buildExtract(DstReg, Load, 0);
2349   } else
2350     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2351 
2352   MI.eraseFromParent();
2353   return true;
2354 }
2355 
2356 static LLT widenToNextPowerOf2(LLT Ty) {
2357   if (Ty.isVector())
2358     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2359   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2360 }
2361 
2362 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2363                                        MachineInstr &MI) const {
2364   MachineIRBuilder &B = Helper.MIRBuilder;
2365   MachineRegisterInfo &MRI = *B.getMRI();
2366   GISelChangeObserver &Observer = Helper.Observer;
2367 
2368   Register PtrReg = MI.getOperand(1).getReg();
2369   LLT PtrTy = MRI.getType(PtrReg);
2370   unsigned AddrSpace = PtrTy.getAddressSpace();
2371 
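  // Loads from the 32-bit constant address space are handled by casting the
  // pointer to the full 64-bit constant address space.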
2372   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2373     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2374     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2375     Observer.changingInstr(MI);
2376     MI.getOperand(1).setReg(Cast.getReg(0));
2377     Observer.changedInstr(MI);
2378     return true;
2379   }
2380 
2381   Register ValReg = MI.getOperand(0).getReg();
2382   LLT ValTy = MRI.getType(ValReg);
2383 
2384   MachineMemOperand *MMO = *MI.memoperands_begin();
2385   const unsigned ValSize = ValTy.getSizeInBits();
2386   const unsigned MemSize = 8 * MMO->getSize();
2387   const Align MemAlign = MMO->getAlign();
2388   const unsigned AlignInBits = 8 * MemAlign.value();
2389 
2390   // Widen non-power-of-2 loads to the alignment if needed
2391   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2392     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2393 
2394     // This was already the correct extending load result type, so just adjust
2395     // the memory type.
2396     if (WideMemSize == ValSize) {
2397       MachineFunction &MF = B.getMF();
2398 
2399       MachineMemOperand *WideMMO =
2400           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2401       Observer.changingInstr(MI);
2402       MI.setMemRefs(MF, {WideMMO});
2403       Observer.changedInstr(MI);
2404       return true;
2405     }
2406 
    // Don't bother handling an edge case that should probably never be
    // produced.
2408     if (ValSize > WideMemSize)
2409       return false;
2410 
2411     LLT WideTy = widenToNextPowerOf2(ValTy);
2412 
2413     Register WideLoad;
2414     if (!WideTy.isVector()) {
2415       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2416       B.buildTrunc(ValReg, WideLoad).getReg(0);
2417     } else {
2418       // Extract the subvector.
2419 
2420       if (isRegisterType(ValTy)) {
2421         // If this a case where G_EXTRACT is legal, use it.
2422         // (e.g. <3 x s32> -> <4 x s32>)
2423         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2424         B.buildExtract(ValReg, WideLoad, 0);
2425       } else {
2426         // For cases where the widened type isn't a nice register value, unmerge
2427         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2428         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2429         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2430         B.setInsertPt(B.getMBB(), MI.getIterator());
2431         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2432       }
2433     }
2434 
2435     MI.eraseFromParent();
2436     return true;
2437   }
2438 
2439   return false;
2440 }
2441 
2442 bool AMDGPULegalizerInfo::legalizeFMad(
2443   MachineInstr &MI, MachineRegisterInfo &MRI,
2444   MachineIRBuilder &B) const {
2445   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2446   assert(Ty.isScalar());
2447 
2448   MachineFunction &MF = B.getMF();
2449   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2450 
2451   // TODO: Always legal with future ftz flag.
  // FIXME: Do we only need to check the output denormal mode?
2453   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2454     return true;
2455   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2456     return true;
2457 
2458   MachineIRBuilder HelperBuilder(MI);
2459   GISelObserverWrapper DummyObserver;
2460   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2461   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2462 }
2463 
2464 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2465   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2466   Register DstReg = MI.getOperand(0).getReg();
2467   Register PtrReg = MI.getOperand(1).getReg();
2468   Register CmpVal = MI.getOperand(2).getReg();
2469   Register NewVal = MI.getOperand(3).getReg();
2470 
2471   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2472          "this should not have been custom lowered");
2473 
2474   LLT ValTy = MRI.getType(CmpVal);
2475   LLT VecTy = LLT::vector(2, ValTy);
2476 
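  // The target instruction expects the new value and the compare value packed
  // together into a two-element vector operand.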
2477   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2478 
2479   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2480     .addDef(DstReg)
2481     .addUse(PtrReg)
2482     .addUse(PackedVal)
2483     .setMemRefs(MI.memoperands());
2484 
2485   MI.eraseFromParent();
2486   return true;
2487 }
2488 
2489 bool AMDGPULegalizerInfo::legalizeFlog(
2490   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2491   Register Dst = MI.getOperand(0).getReg();
2492   Register Src = MI.getOperand(1).getReg();
2493   LLT Ty = B.getMRI()->getType(Dst);
2494   unsigned Flags = MI.getFlags();
2495 
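  // log_b(x) = log2(x) * log_b(2); the caller passes log_b(2) as
  // Log2BaseInverted.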
2496   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2497   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2498 
2499   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2500   MI.eraseFromParent();
2501   return true;
2502 }
2503 
2504 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2505                                        MachineIRBuilder &B) const {
2506   Register Dst = MI.getOperand(0).getReg();
2507   Register Src = MI.getOperand(1).getReg();
2508   unsigned Flags = MI.getFlags();
2509   LLT Ty = B.getMRI()->getType(Dst);
2510 
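  // exp(x) = exp2(x * log2(e))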
2511   auto K = B.buildFConstant(Ty, numbers::log2e);
2512   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2513   B.buildFExp2(Dst, Mul, Flags);
2514   MI.eraseFromParent();
2515   return true;
2516 }
2517 
2518 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2519                                        MachineIRBuilder &B) const {
2520   Register Dst = MI.getOperand(0).getReg();
2521   Register Src0 = MI.getOperand(1).getReg();
2522   Register Src1 = MI.getOperand(2).getReg();
2523   unsigned Flags = MI.getFlags();
2524   LLT Ty = B.getMRI()->getType(Dst);
2525   const LLT S16 = LLT::scalar(16);
2526   const LLT S32 = LLT::scalar(32);
2527 
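  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses the legacy
  // semantics where 0 * x is always 0.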
2528   if (Ty == S32) {
2529     auto Log = B.buildFLog2(S32, Src0, Flags);
2530     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2531       .addUse(Log.getReg(0))
2532       .addUse(Src1)
2533       .setMIFlags(Flags);
2534     B.buildFExp2(Dst, Mul, Flags);
2535   } else if (Ty == S16) {
2536     // There's no f16 fmul_legacy, so we need to convert for it.
2537     auto Log = B.buildFLog2(S16, Src0, Flags);
2538     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2539     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2540     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2541       .addUse(Ext0.getReg(0))
2542       .addUse(Ext1.getReg(0))
2543       .setMIFlags(Flags);
2544 
2545     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2546   } else
2547     return false;
2548 
2549   MI.eraseFromParent();
2550   return true;
2551 }
2552 
2553 // Find a source register, ignoring any possible source modifiers.
2554 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2555   Register ModSrc = OrigSrc;
2556   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2557     ModSrc = SrcFNeg->getOperand(1).getReg();
2558     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2559       ModSrc = SrcFAbs->getOperand(1).getReg();
2560   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2561     ModSrc = SrcFAbs->getOperand(1).getReg();
2562   return ModSrc;
2563 }
2564 
2565 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2566                                          MachineRegisterInfo &MRI,
2567                                          MachineIRBuilder &B) const {
2568 
2569   const LLT S1 = LLT::scalar(1);
2570   const LLT S64 = LLT::scalar(64);
2571   Register Dst = MI.getOperand(0).getReg();
2572   Register OrigSrc = MI.getOperand(1).getReg();
2573   unsigned Flags = MI.getFlags();
2574   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2575          "this should not have been custom lowered");
2576 
2577   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2578   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2579   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2580   // V_FRACT bug is:
2581   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2582   //
2583   // Convert floor(x) to (x - fract(x))
2584 
2585   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2586     .addUse(OrigSrc)
2587     .setMIFlags(Flags);
2588 
2589   // Give source modifier matching some assistance before obscuring a foldable
2590   // pattern.
2591 
2592   // TODO: We can avoid the neg on the fract? The input sign to fract
2593   // shouldn't matter?
2594   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2595 
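  // 0x3fefffffffffffff is the largest double strictly less than 1.0.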
2596   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2597 
2598   Register Min = MRI.createGenericVirtualRegister(S64);
2599 
2600   // We don't need to concern ourselves with the snan handling difference, so
2601   // use the one which will directly select.
2602   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2603   if (MFI->getMode().IEEE)
2604     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2605   else
2606     B.buildFMinNum(Min, Fract, Const, Flags);
2607 
2608   Register CorrectedFract = Min;
2609   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2610     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2611     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2612   }
2613 
2614   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2615   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2616 
2617   MI.eraseFromParent();
2618   return true;
2619 }
2620 
2621 // Turn an illegal packed v2s16 build vector into bit operations.
2622 // TODO: This should probably be a bitcast action in LegalizerHelper.
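//
// The two s16 sources are merged into an s32 and bitcast to <2 x s16>, e.g.:
//   %merge:_(s32) = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)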
2623 bool AMDGPULegalizerInfo::legalizeBuildVector(
2624   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2625   Register Dst = MI.getOperand(0).getReg();
2626   const LLT S32 = LLT::scalar(32);
2627   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2628 
2629   Register Src0 = MI.getOperand(1).getReg();
2630   Register Src1 = MI.getOperand(2).getReg();
2631   assert(MRI.getType(Src0) == LLT::scalar(16));
2632 
2633   auto Merge = B.buildMerge(S32, {Src0, Src1});
2634   B.buildBitcast(Dst, Merge);
2635 
2636   MI.eraseFromParent();
2637   return true;
2638 }
2639 
2640 // Check that this is a G_XOR x, -1
2641 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2642   if (MI.getOpcode() != TargetOpcode::G_XOR)
2643     return false;
2644   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2645   return ConstVal && *ConstVal == -1;
2646 }
2647 
2648 // Return the use branch instruction, otherwise null if the usage is invalid.
2649 static MachineInstr *
2650 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2651                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2652   Register CondDef = MI.getOperand(0).getReg();
2653   if (!MRI.hasOneNonDBGUse(CondDef))
2654     return nullptr;
2655 
2656   MachineBasicBlock *Parent = MI.getParent();
2657   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2658 
2659   if (isNot(MRI, *UseMI)) {
2660     Register NegatedCond = UseMI->getOperand(0).getReg();
2661     if (!MRI.hasOneNonDBGUse(NegatedCond))
2662       return nullptr;
2663 
2664     // We're deleting the def of this value, so we need to remove it.
2665     UseMI->eraseFromParent();
2666 
2667     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2668     Negated = true;
2669   }
2670 
2671   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2672     return nullptr;
2673 
2674   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2675   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2676   if (Next == Parent->end()) {
2677     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2678     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2679       return nullptr;
2680     UncondBrTarget = &*NextMBB;
2681   } else {
2682     if (Next->getOpcode() != AMDGPU::G_BR)
2683       return nullptr;
2684     Br = &*Next;
2685     UncondBrTarget = Br->getOperand(0).getMBB();
2686   }
2687 
2688   return UseMI;
2689 }
2690 
2691 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2692                                          const ArgDescriptor *Arg,
2693                                          const TargetRegisterClass *ArgRC,
2694                                          LLT ArgTy) const {
2695   MCRegister SrcReg = Arg->getRegister();
2696   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2697   assert(DstReg.isVirtual() && "Virtual register expected");
2698 
2699   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2700                                              ArgTy);
2701   if (Arg->isMasked()) {
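    // The argument occupies a bit field of a shared input register; shift and
    // mask to extract it.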
2702     // TODO: Should we try to emit this once in the entry block?
2703     const LLT S32 = LLT::scalar(32);
2704     const unsigned Mask = Arg->getMask();
2705     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2706 
2707     Register AndMaskSrc = LiveIn;
2708 
2709     if (Shift != 0) {
2710       auto ShiftAmt = B.buildConstant(S32, Shift);
2711       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2712     }
2713 
2714     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2715   } else {
2716     B.buildCopy(DstReg, LiveIn);
2717   }
2718 
2719   return true;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::loadInputValue(
2723     Register DstReg, MachineIRBuilder &B,
2724     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2725   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2726   const ArgDescriptor *Arg;
2727   const TargetRegisterClass *ArgRC;
2728   LLT ArgTy;
2729   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2730 
2731   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2732     return false; // TODO: Handle these
2733   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2734 }
2735 
2736 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2737     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2738     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2739   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2740     return false;
2741 
2742   MI.eraseFromParent();
2743   return true;
2744 }
2745 
2746 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2747                                        MachineRegisterInfo &MRI,
2748                                        MachineIRBuilder &B) const {
2749   Register Dst = MI.getOperand(0).getReg();
2750   LLT DstTy = MRI.getType(Dst);
2751   LLT S16 = LLT::scalar(16);
2752   LLT S32 = LLT::scalar(32);
2753   LLT S64 = LLT::scalar(64);
2754 
2755   if (DstTy == S16)
2756     return legalizeFDIV16(MI, MRI, B);
2757   if (DstTy == S32)
2758     return legalizeFDIV32(MI, MRI, B);
2759   if (DstTy == S64)
2760     return legalizeFDIV64(MI, MRI, B);
2761 
2762   return false;
2763 }
2764 
2765 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2766                                                   Register DstReg,
2767                                                   Register X,
2768                                                   Register Y,
2769                                                   bool IsDiv) const {
2770   const LLT S1 = LLT::scalar(1);
2771   const LLT S32 = LLT::scalar(32);
2772 
2773   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2774   // algorithm used here.
2775 
2776   // Initial estimate of inv(y).
2777   auto FloatY = B.buildUITOFP(S32, Y);
2778   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2779   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2780   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2781   auto Z = B.buildFPTOUI(S32, ScaledY);
2782 
  // One round of UNR (unsigned Newton-Raphson refinement).
2784   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2785   auto NegYZ = B.buildMul(S32, NegY, Z);
2786   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2787 
2788   // Quotient/remainder estimate.
2789   auto Q = B.buildUMulH(S32, X, Z);
2790   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2791 
2792   // First quotient/remainder refinement.
2793   auto One = B.buildConstant(S32, 1);
2794   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2795   if (IsDiv)
2796     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2797   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2798 
2799   // Second quotient/remainder refinement.
2800   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2801   if (IsDiv)
2802     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2803   else
2804     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2805 }
2806 
2807 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2808                                               MachineRegisterInfo &MRI,
2809                                               MachineIRBuilder &B) const {
2810   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2811   Register DstReg = MI.getOperand(0).getReg();
2812   Register Num = MI.getOperand(1).getReg();
2813   Register Den = MI.getOperand(2).getReg();
2814   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2815   MI.eraseFromParent();
2816   return true;
2817 }
2818 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32.
2820 //
2821 // Return lo, hi of result
2822 //
2823 // %cvt.lo = G_UITOFP Val.lo
2824 // %cvt.hi = G_UITOFP Val.hi
2825 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2826 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2827 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2828 // %mul2 = G_FMUL %mul1, 2**(-32)
2829 // %trunc = G_INTRINSIC_TRUNC %mul2
2830 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2831 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2832 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2833                                                        Register Val) {
2834   const LLT S32 = LLT::scalar(32);
2835   auto Unmerge = B.buildUnmerge(S32, Val);
2836 
2837   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2838   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2839 
2840   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2841                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2842 
2843   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2844   auto Mul1 =
2845       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2846 
2847   // 2**(-32)
2848   auto Mul2 =
2849       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2850   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2851 
2852   // -(2**32)
2853   auto Mad2 = B.buildFMAD(S32, Trunc,
2854                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2855 
2856   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2857   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2858 
2859   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2860 }
2861 
2862 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2863                                                   Register DstReg,
2864                                                   Register Numer,
2865                                                   Register Denom,
2866                                                   bool IsDiv) const {
2867   const LLT S32 = LLT::scalar(32);
2868   const LLT S64 = LLT::scalar(64);
2869   const LLT S1 = LLT::scalar(1);
2870   Register RcpLo, RcpHi;
2871 
2872   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2873 
2874   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2875 
2876   auto Zero64 = B.buildConstant(S64, 0);
2877   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2878 
2879   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2880   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2881 
2882   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2883   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2884   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2885 
2886   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2887   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
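  // _HiNc: the high half computed without the low-half carry; that carry is
  // instead consumed by Add2_HiC below.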
2888   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2889   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2890 
2891   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2892   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2893   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2894   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2895   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2896 
2897   auto Zero32 = B.buildConstant(S32, 0);
2898   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2899   auto Add2_HiC =
2900       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2901   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2902   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2903 
2904   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2905   Register NumerLo = UnmergeNumer.getReg(0);
2906   Register NumerHi = UnmergeNumer.getReg(1);
2907 
2908   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2909   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2910   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2911   Register Mul3_Lo = UnmergeMul3.getReg(0);
2912   Register Mul3_Hi = UnmergeMul3.getReg(1);
2913   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2914   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2915   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2916   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2917 
2918   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2919   Register DenomLo = UnmergeDenom.getReg(0);
2920   Register DenomHi = UnmergeDenom.getReg(1);
2921 
2922   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2923   auto C1 = B.buildSExt(S32, CmpHi);
2924 
2925   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2926   auto C2 = B.buildSExt(S32, CmpLo);
2927 
2928   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2929   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2930 
  // TODO: Here and below, portions of the code can be enclosed in if/endif
  // blocks. Currently control flow is unconditional and we have 4 selects
  // after the potential endif to substitute PHIs.
2934 
2935   // if C3 != 0 ...
2936   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2937   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2938   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2939   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2940 
2941   auto One64 = B.buildConstant(S64, 1);
2942   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2943 
2944   auto C4 =
2945       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2946   auto C5 =
2947       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2948   auto C6 = B.buildSelect(
2949       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2950 
2951   // if (C6 != 0)
2952   auto Add4 = B.buildAdd(S64, Add3, One64);
2953   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2954 
2955   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2956   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2957   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2958 
2959   // endif C6
2960   // endif C3
2961 
2962   if (IsDiv) {
2963     auto Sel1 = B.buildSelect(
2964         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2965     B.buildSelect(DstReg,
2966                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2967   } else {
2968     auto Sel2 = B.buildSelect(
2969         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2970     B.buildSelect(DstReg,
2971                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2972   }
2973 }
2974 
2975 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2976                                             MachineRegisterInfo &MRI,
2977                                             MachineIRBuilder &B) const {
2978   const LLT S64 = LLT::scalar(64);
2979   const LLT S32 = LLT::scalar(32);
2980   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2981   Register DstReg = MI.getOperand(0).getReg();
2982   Register Num = MI.getOperand(1).getReg();
2983   Register Den = MI.getOperand(2).getReg();
2984   LLT Ty = MRI.getType(DstReg);
2985 
2986   if (Ty == S32)
2987     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2988   else if (Ty == S64)
2989     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2990   else
2991     return false;
2992 
2993   MI.eraseFromParent();
2994   return true;
2996 }
2997 
2998 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2999                                             MachineRegisterInfo &MRI,
3000                                             MachineIRBuilder &B) const {
3001   const LLT S64 = LLT::scalar(64);
3002   const LLT S32 = LLT::scalar(32);
3003 
3004   Register DstReg = MI.getOperand(0).getReg();
3005   const LLT Ty = MRI.getType(DstReg);
3006   if (Ty != S32 && Ty != S64)
3007     return false;
3008 
3009   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3010 
3011   Register LHS = MI.getOperand(1).getReg();
3012   Register RHS = MI.getOperand(2).getReg();
3013 
3014   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3015   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3016   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3017 
3018   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3019   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3020 
3021   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3022   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3023 
3024   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3025   if (Ty == S32)
3026     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3027   else
3028     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3029 
3030   Register Sign;
3031   if (IsDiv)
3032     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3033   else
3034     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3035 
3036   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3037   B.buildSub(DstReg, UDivRem, Sign);
3038 
3039   MI.eraseFromParent();
3040   return true;
3041 }
3042 
3043 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3044                                                  MachineRegisterInfo &MRI,
3045                                                  MachineIRBuilder &B) const {
3046   Register Res = MI.getOperand(0).getReg();
3047   Register LHS = MI.getOperand(1).getReg();
3048   Register RHS = MI.getOperand(2).getReg();
3049   uint16_t Flags = MI.getFlags();
3050   LLT ResTy = MRI.getType(Res);
3051 
3052   const MachineFunction &MF = B.getMF();
3053   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3054                             MI.getFlag(MachineInstr::FmAfn);
3055 
3056   if (!AllowInaccurateRcp)
3057     return false;
3058 
3059   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3060     // 1 / x -> RCP(x)
3061     if (CLHS->isExactlyValue(1.0)) {
3062       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3063         .addUse(RHS)
3064         .setMIFlags(Flags);
3065 
3066       MI.eraseFromParent();
3067       return true;
3068     }
3069 
3070     // -1 / x -> RCP( FNEG(x) )
3071     if (CLHS->isExactlyValue(-1.0)) {
3072       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3073       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3074         .addUse(FNeg.getReg(0))
3075         .setMIFlags(Flags);
3076 
3077       MI.eraseFromParent();
3078       return true;
3079     }
3080   }
3081 
3082   // x / y -> x * (1.0 / y)
3083   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3084     .addUse(RHS)
3085     .setMIFlags(Flags);
3086   B.buildFMul(Res, LHS, RCP, Flags);
3087 
3088   MI.eraseFromParent();
3089   return true;
3090 }
3091 
3092 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3093                                                    MachineRegisterInfo &MRI,
3094                                                    MachineIRBuilder &B) const {
3095   Register Res = MI.getOperand(0).getReg();
3096   Register X = MI.getOperand(1).getReg();
3097   Register Y = MI.getOperand(2).getReg();
3098   uint16_t Flags = MI.getFlags();
3099   LLT ResTy = MRI.getType(Res);
3100 
3101   const MachineFunction &MF = B.getMF();
3102   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3103                             MI.getFlag(MachineInstr::FmAfn);
3104 
3105   if (!AllowInaccurateRcp)
3106     return false;
3107 
3108   auto NegY = B.buildFNeg(ResTy, Y);
3109   auto One = B.buildFConstant(ResTy, 1.0);
3110 
3111   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3112     .addUse(Y)
3113     .setMIFlags(Flags);
3114 
3115   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3116   R = B.buildFMA(ResTy, Tmp0, R, R);
3117 
3118   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3119   R = B.buildFMA(ResTy, Tmp1, R, R);
3120 
3121   auto Ret = B.buildFMul(ResTy, X, R);
3122   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3123 
3124   B.buildFMA(Res, Tmp2, R, Ret);
3125   MI.eraseFromParent();
3126   return true;
3127 }
3128 
3129 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3130                                          MachineRegisterInfo &MRI,
3131                                          MachineIRBuilder &B) const {
3132   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3133     return true;
3134 
3135   Register Res = MI.getOperand(0).getReg();
3136   Register LHS = MI.getOperand(1).getReg();
3137   Register RHS = MI.getOperand(2).getReg();
3138 
3139   uint16_t Flags = MI.getFlags();
3140 
3141   LLT S16 = LLT::scalar(16);
3142   LLT S32 = LLT::scalar(32);
3143 
3144   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3145   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3146 
3147   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3148     .addUse(RHSExt.getReg(0))
3149     .setMIFlags(Flags);
3150 
3151   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3152   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3153 
3154   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3155     .addUse(RDst.getReg(0))
3156     .addUse(RHS)
3157     .addUse(LHS)
3158     .setMIFlags(Flags);
3159 
3160   MI.eraseFromParent();
3161   return true;
3162 }
3163 
3164 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3165 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3166 static void toggleSPDenormMode(bool Enable,
3167                                MachineIRBuilder &B,
3168                                const GCNSubtarget &ST,
3169                                AMDGPU::SIModeRegisterDefaults Mode) {
3170   // Set SP denorm mode to this value.
3171   unsigned SPDenormMode =
3172     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3173 
3174   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3176     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3177 
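    // S_DENORM_MODE packs the FP32 mode into bits [1:0] and the FP64/FP16
    // mode into bits [3:2].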
3178     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3179     B.buildInstr(AMDGPU::S_DENORM_MODE)
3180       .addImm(NewDenormModeValue);
3181 
3182   } else {
3183     // Select FP32 bit field in mode register.
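    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm control is the
    // 2-bit field at offset 4 (WIDTH_M1 == 1).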
3184     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3185                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3186                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3187 
3188     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3189       .addImm(SPDenormMode)
3190       .addImm(SPDenormModeBitField);
3191   }
3192 }
3193 
3194 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3195                                          MachineRegisterInfo &MRI,
3196                                          MachineIRBuilder &B) const {
3197   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3198     return true;
3199 
3200   Register Res = MI.getOperand(0).getReg();
3201   Register LHS = MI.getOperand(1).getReg();
3202   Register RHS = MI.getOperand(2).getReg();
3203   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3204   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3205 
3206   uint16_t Flags = MI.getFlags();
3207 
3208   LLT S32 = LLT::scalar(32);
3209   LLT S1 = LLT::scalar(1);
3210 
3211   auto One = B.buildFConstant(S32, 1.0f);
3212 
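  // The trailing immediate of div_scale selects which operand the scale is
  // applied to: 0 for the denominator, 1 for the numerator (hence the result
  // names below).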
3213   auto DenominatorScaled =
3214     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3215       .addUse(LHS)
3216       .addUse(RHS)
3217       .addImm(0)
3218       .setMIFlags(Flags);
3219   auto NumeratorScaled =
3220     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3221       .addUse(LHS)
3222       .addUse(RHS)
3223       .addImm(1)
3224       .setMIFlags(Flags);
3225 
3226   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3227     .addUse(DenominatorScaled.getReg(0))
3228     .setMIFlags(Flags);
3229   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3230 
3231   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3232   // aren't modeled as reading it.
3233   if (!Mode.allFP32Denormals())
3234     toggleSPDenormMode(true, B, ST, Mode);
3235 
3236   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3237   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3238   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3239   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3240   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3241   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3242 
3243   if (!Mode.allFP32Denormals())
3244     toggleSPDenormMode(false, B, ST, Mode);
3245 
3246   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3247     .addUse(Fma4.getReg(0))
3248     .addUse(Fma1.getReg(0))
3249     .addUse(Fma3.getReg(0))
3250     .addUse(NumeratorScaled.getReg(1))
3251     .setMIFlags(Flags);
3252 
3253   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3254     .addUse(Fmas.getReg(0))
3255     .addUse(RHS)
3256     .addUse(LHS)
3257     .setMIFlags(Flags);
3258 
3259   MI.eraseFromParent();
3260   return true;
3261 }
3262 
3263 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3264                                          MachineRegisterInfo &MRI,
3265                                          MachineIRBuilder &B) const {
3266   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3267     return true;
3268 
3269   Register Res = MI.getOperand(0).getReg();
3270   Register LHS = MI.getOperand(1).getReg();
3271   Register RHS = MI.getOperand(2).getReg();
3272 
3273   uint16_t Flags = MI.getFlags();
3274 
3275   LLT S64 = LLT::scalar(64);
3276   LLT S1 = LLT::scalar(1);
3277 
3278   auto One = B.buildFConstant(S64, 1.0);
3279 
3280   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3281     .addUse(LHS)
3282     .addUse(RHS)
3283     .addImm(0)
3284     .setMIFlags(Flags);
3285 
3286   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3287 
3288   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3289     .addUse(DivScale0.getReg(0))
3290     .setMIFlags(Flags);
3291 
3292   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3293   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3294   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3295 
3296   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3297     .addUse(LHS)
3298     .addUse(RHS)
3299     .addImm(1)
3300     .setMIFlags(Flags);
3301 
3302   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3303   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3304   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3305 
3306   Register Scale;
3307   if (!ST.hasUsableDivScaleConditionOutput()) {
3308     // Workaround a hardware bug on SI where the condition output from div_scale
3309     // is not usable.
3310 
3311     LLT S32 = LLT::scalar(32);
3312 
3313     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3314     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3315     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3316     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3317 
3318     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3319                               Scale1Unmerge.getReg(1));
3320     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3321                               Scale0Unmerge.getReg(1));
3322     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3323   } else {
3324     Scale = DivScale1.getReg(1);
3325   }
3326 
3327   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3328     .addUse(Fma4.getReg(0))
3329     .addUse(Fma3.getReg(0))
3330     .addUse(Mul.getReg(0))
3331     .addUse(Scale)
3332     .setMIFlags(Flags);
3333 
3334   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3335     .addUse(Fmas.getReg(0))
3336     .addUse(RHS)
3337     .addUse(LHS)
3338     .setMIFlags(Flags);
3339 
3340   MI.eraseFromParent();
3341   return true;
3342 }
3343 
3344 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3345                                                  MachineRegisterInfo &MRI,
3346                                                  MachineIRBuilder &B) const {
3347   Register Res = MI.getOperand(0).getReg();
3348   Register LHS = MI.getOperand(2).getReg();
3349   Register RHS = MI.getOperand(3).getReg();
3350   uint16_t Flags = MI.getFlags();
3351 
3352   LLT S32 = LLT::scalar(32);
3353   LLT S1 = LLT::scalar(1);
3354 
3355   auto Abs = B.buildFAbs(S32, RHS, Flags);
3356   const APFloat C0Val(1.0f);
3357 
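  // 0x6f800000 is 2**96 and 0x2f800000 is 2**(-32): if |RHS| is very large,
  // pre-scale it by 2**(-32) so the reciprocal does not underflow, then apply
  // the same scale to the final product.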
3358   auto C0 = B.buildConstant(S32, 0x6f800000);
3359   auto C1 = B.buildConstant(S32, 0x2f800000);
3360   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3361 
3362   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3363   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3364 
3365   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3366 
3367   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3368     .addUse(Mul0.getReg(0))
3369     .setMIFlags(Flags);
3370 
3371   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3372 
3373   B.buildFMul(Res, Sel, Mul1, Flags);
3374 
3375   MI.eraseFromParent();
3376   return true;
3377 }
3378 
3379 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3380 // FIXME: Why do we handle this one but not other removed instructions?
3381 //
3382 // Reciprocal square root.  The clamp prevents infinite results, clamping
3383 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3384 // +-max_float.
3385 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3386                                                     MachineRegisterInfo &MRI,
3387                                                     MachineIRBuilder &B) const {
3388   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3389     return true;
3390 
3391   Register Dst = MI.getOperand(0).getReg();
3392   Register Src = MI.getOperand(2).getReg();
3393   auto Flags = MI.getFlags();
3394 
3395   LLT Ty = MRI.getType(Dst);
3396 
3397   const fltSemantics *FltSemantics;
3398   if (Ty == LLT::scalar(32))
3399     FltSemantics = &APFloat::IEEEsingle();
3400   else if (Ty == LLT::scalar(64))
3401     FltSemantics = &APFloat::IEEEdouble();
3402   else
3403     return false;
3404 
3405   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3406     .addUse(Src)
3407     .setMIFlags(Flags);
3408 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq result is already quieted (or not); use the variant which will
  // directly select.
3411   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3412   const bool UseIEEE = MFI->getMode().IEEE;
3413 
3414   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3415   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3416                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3417 
3418   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3419 
3420   if (UseIEEE)
3421     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3422   else
3423     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3424   MI.eraseFromParent();
3425   return true;
3426 }
3427 
3428 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3429   switch (IID) {
3430   case Intrinsic::amdgcn_ds_fadd:
3431     return AMDGPU::G_ATOMICRMW_FADD;
3432   case Intrinsic::amdgcn_ds_fmin:
3433     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3434   case Intrinsic::amdgcn_ds_fmax:
3435     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3436   default:
3437     llvm_unreachable("not a DS FP intrinsic");
3438   }
3439 }
3440 
3441 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3442                                                       MachineInstr &MI,
3443                                                       Intrinsic::ID IID) const {
3444   GISelChangeObserver &Observer = Helper.Observer;
3445   Observer.changingInstr(MI);
3446 
3447   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3448 
3449   // The remaining operands were used to set fields in the MemOperand on
3450   // construction.
3451   for (int I = 6; I > 3; --I)
3452     MI.RemoveOperand(I);
3453 
3454   MI.RemoveOperand(1); // Remove the intrinsic ID.
3455   Observer.changedInstr(MI);
3456   return true;
3457 }
3458 
3459 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3460                                             MachineRegisterInfo &MRI,
3461                                             MachineIRBuilder &B) const {
3462   uint64_t Offset =
3463     ST.getTargetLowering()->getImplicitParameterOffset(
3464       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3465   LLT DstTy = MRI.getType(DstReg);
3466   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3467 
3468   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3469   if (!loadInputValue(KernargPtrReg, B,
3470                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3471     return false;
3472 
3473   // FIXME: This should be nuw
3474   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3475   return true;
3476 }
3477 
3478 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3479                                                  MachineRegisterInfo &MRI,
3480                                                  MachineIRBuilder &B) const {
3481   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3482   if (!MFI->isEntryFunction()) {
3483     return legalizePreloadedArgIntrin(MI, MRI, B,
3484                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3485   }
3486 
3487   Register DstReg = MI.getOperand(0).getReg();
3488   if (!getImplicitArgPtr(DstReg, MRI, B))
3489     return false;
3490 
3491   MI.eraseFromParent();
3492   return true;
3493 }
3494 
3495 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3496                                               MachineRegisterInfo &MRI,
3497                                               MachineIRBuilder &B,
3498                                               unsigned AddrSpace) const {
3499   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3500   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3501   Register Hi32 = Unmerge.getReg(1);
3502 
3503   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3504   MI.eraseFromParent();
3505   return true;
3506 }
3507 
3508 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3509 // offset (the offset that is included in bounds checking and swizzling, to be
3510 // split between the instruction's voffset and immoffset fields) and soffset
3511 // (the offset that is excluded from bounds checking and swizzling, to go in
3512 // the instruction's soffset field).  This function takes the first kind of
3513 // offset and figures out how to split it between voffset and immoffset.
3514 std::tuple<Register, unsigned, unsigned>
3515 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3516                                         Register OrigOffset) const {
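  // The buffer instruction's immediate offset field is 12 bits wide, so 4095
  // is the largest offset that can be folded into the instruction.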
3517   const unsigned MaxImm = 4095;
3518   Register BaseReg;
3519   unsigned TotalConstOffset;
3520   const LLT S32 = LLT::scalar(32);
3521   MachineRegisterInfo &MRI = *B.getMRI();
3522 
3523   std::tie(BaseReg, TotalConstOffset) =
3524       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3525 
3526   unsigned ImmOffset = TotalConstOffset;
3527 
3528   // If BaseReg is a pointer, convert it to int.
3529   if (MRI.getType(BaseReg).isPointer())
3530     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3531 
  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits (value & 4095) in the immoffset field, so that the value that
  // is copied/added for the voffset field is a multiple of 4096 and stands
  // more chance of being CSEd with the copy/add for another similar
  // load/store.
3536   // However, do not do that rounding down to a multiple of 4096 if that is a
3537   // negative number, as it appears to be illegal to have a negative offset
3538   // in the vgpr, even if adding the immediate offset makes it positive.
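  // For example, a total constant offset of 5000 ends up as ImmOffset = 904,
  // with the remaining 4096 folded into the voffset add below.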
3539   unsigned Overflow = ImmOffset & ~MaxImm;
3540   ImmOffset -= Overflow;
3541   if ((int32_t)Overflow < 0) {
3542     Overflow += ImmOffset;
3543     ImmOffset = 0;
3544   }
3545 
3546   if (Overflow != 0) {
3547     if (!BaseReg) {
3548       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3549     } else {
3550       auto OverflowVal = B.buildConstant(S32, Overflow);
3551       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3552     }
3553   }
3554 
3555   if (!BaseReg)
3556     BaseReg = B.buildConstant(S32, 0).getReg(0);
3557 
3558   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3559 }
3560 
3561 /// Handle register layout difference for f16 images for some subtargets.
3562 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3563                                              MachineRegisterInfo &MRI,
3564                                              Register Reg,
3565                                              bool ImageStore) const {
3566   const LLT S16 = LLT::scalar(16);
3567   const LLT S32 = LLT::scalar(32);
3568   LLT StoreVT = MRI.getType(Reg);
3569   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3570 
3571   if (ST.hasUnpackedD16VMem()) {
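    // e.g. <4 x s16> becomes <4 x s32>, one any-extended value per dword.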
3572     auto Unmerge = B.buildUnmerge(S16, Reg);
3573 
3574     SmallVector<Register, 4> WideRegs;
3575     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3576       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3577 
3578     int NumElts = StoreVT.getNumElements();
3579 
3580     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3581   }
3582 
3583   if (ImageStore && ST.hasImageStoreD16Bug()) {
3584     if (StoreVT.getNumElements() == 2) {
3585       SmallVector<Register, 4> PackedRegs;
3586       Reg = B.buildBitcast(S32, Reg).getReg(0);
3587       PackedRegs.push_back(Reg);
3588       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3589       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3590     }
3591 
3592     if (StoreVT.getNumElements() == 3) {
3593       SmallVector<Register, 4> PackedRegs;
3594       auto Unmerge = B.buildUnmerge(S16, Reg);
3595       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3596         PackedRegs.push_back(Unmerge.getReg(I));
3597       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3598       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3599       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3600     }
3601 
3602     if (StoreVT.getNumElements() == 4) {
3603       SmallVector<Register, 4> PackedRegs;
3604       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3605       auto Unmerge = B.buildUnmerge(S32, Reg);
3606       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3607         PackedRegs.push_back(Unmerge.getReg(I));
3608       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3609       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3610     }
3611 
3612     llvm_unreachable("invalid data type");
3613   }
3614 
3615   return Reg;
3616 }
3617 
3618 Register AMDGPULegalizerInfo::fixStoreSourceType(
3619   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3620   MachineRegisterInfo *MRI = B.getMRI();
3621   LLT Ty = MRI->getType(VData);
3622 
3623   const LLT S16 = LLT::scalar(16);
3624 
  // Fix up illegal register types for i8 and i16 stores.
3626   if (Ty == LLT::scalar(8) || Ty == S16) {
3627     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3628     return AnyExt;
3629   }
3630 
3631   if (Ty.isVector()) {
3632     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3633       if (IsFormat)
3634         return handleD16VData(B, *MRI, VData);
3635     }
3636   }
3637 
3638   return VData;
3639 }
3640 
3641 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3642                                               MachineRegisterInfo &MRI,
3643                                               MachineIRBuilder &B,
3644                                               bool IsTyped,
3645                                               bool IsFormat) const {
3646   Register VData = MI.getOperand(1).getReg();
3647   LLT Ty = MRI.getType(VData);
3648   LLT EltTy = Ty.getScalarType();
3649   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3650   const LLT S32 = LLT::scalar(32);
3651 
3652   VData = fixStoreSourceType(B, VData, IsFormat);
3653   Register RSrc = MI.getOperand(2).getReg();
3654 
3655   MachineMemOperand *MMO = *MI.memoperands_begin();
3656   const int MemSize = MMO->getSize();
3657 
3658   unsigned ImmOffset;
3659   unsigned TotalOffset;
3660 
3661   // The typed intrinsics add an immediate after the registers.
3662   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3663 
3664   // The struct intrinsic variants add one additional operand over raw.
3665   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3666   Register VIndex;
3667   int OpOffset = 0;
3668   if (HasVIndex) {
3669     VIndex = MI.getOperand(3).getReg();
3670     OpOffset = 1;
3671   }
3672 
3673   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3674   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3675 
3676   unsigned Format = 0;
3677   if (IsTyped) {
3678     Format = MI.getOperand(5 + OpOffset).getImm();
3679     ++OpOffset;
3680   }
3681 
3682   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3683 
3684   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3685   if (TotalOffset != 0)
3686     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3687 
3688   unsigned Opc;
3689   if (IsTyped) {
3690     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3691                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3692   } else if (IsFormat) {
3693     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3694                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3695   } else {
3696     switch (MemSize) {
3697     case 1:
3698       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3699       break;
3700     case 2:
3701       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3702       break;
3703     default:
3704       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3705       break;
3706     }
3707   }
3708 
3709   if (!VIndex)
3710     VIndex = B.buildConstant(S32, 0).getReg(0);
3711 
3712   auto MIB = B.buildInstr(Opc)
3713     .addUse(VData)              // vdata
3714     .addUse(RSrc)               // rsrc
3715     .addUse(VIndex)             // vindex
3716     .addUse(VOffset)            // voffset
3717     .addUse(SOffset)            // soffset
3718     .addImm(ImmOffset);         // offset(imm)
3719 
3720   if (IsTyped)
3721     MIB.addImm(Format);
3722 
3723   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3724      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3725      .addMemOperand(MMO);
3726 
3727   MI.eraseFromParent();
3728   return true;
3729 }
3730 
3731 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3732                                              MachineRegisterInfo &MRI,
3733                                              MachineIRBuilder &B,
3734                                              bool IsFormat,
3735                                              bool IsTyped) const {
3736   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3737   MachineMemOperand *MMO = *MI.memoperands_begin();
3738   const int MemSize = MMO->getSize();
3739   const LLT S32 = LLT::scalar(32);
3740 
3741   Register Dst = MI.getOperand(0).getReg();
3742   Register RSrc = MI.getOperand(2).getReg();
3743 
3744   // The typed intrinsics add an immediate after the registers.
3745   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3746 
3747   // The struct intrinsic variants add one additional operand over raw.
3748   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3749   Register VIndex;
3750   int OpOffset = 0;
3751   if (HasVIndex) {
3752     VIndex = MI.getOperand(3).getReg();
3753     OpOffset = 1;
3754   }
3755 
3756   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3757   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3758 
3759   unsigned Format = 0;
3760   if (IsTyped) {
3761     Format = MI.getOperand(5 + OpOffset).getImm();
3762     ++OpOffset;
3763   }
3764 
3765   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3766   unsigned ImmOffset;
3767   unsigned TotalOffset;
3768 
3769   LLT Ty = MRI.getType(Dst);
3770   LLT EltTy = Ty.getScalarType();
3771   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3772   const bool Unpacked = ST.hasUnpackedD16VMem();
3773 
3774   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3775   if (TotalOffset != 0)
3776     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3777 
3778   unsigned Opc;
3779 
3780   if (IsTyped) {
3781     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3782                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3783   } else if (IsFormat) {
3784     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3785                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3786   } else {
3787     switch (MemSize) {
3788     case 1:
3789       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3790       break;
3791     case 2:
3792       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3793       break;
3794     default:
3795       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3796       break;
3797     }
3798   }
3799 
3800   Register LoadDstReg;
3801 
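  // Sub-dword scalar results are loaded into a wider s32 register and
  // truncated afterwards; unpacked d16 vector results are repacked below.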
3802   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3803   LLT UnpackedTy = Ty.changeElementSize(32);
3804 
3805   if (IsExtLoad)
3806     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3807   else if (Unpacked && IsD16 && Ty.isVector())
3808     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3809   else
3810     LoadDstReg = Dst;
3811 
3812   if (!VIndex)
3813     VIndex = B.buildConstant(S32, 0).getReg(0);
3814 
3815   auto MIB = B.buildInstr(Opc)
3816     .addDef(LoadDstReg)         // vdata
3817     .addUse(RSrc)               // rsrc
3818     .addUse(VIndex)             // vindex
3819     .addUse(VOffset)            // voffset
3820     .addUse(SOffset)            // soffset
3821     .addImm(ImmOffset);         // offset(imm)
3822 
3823   if (IsTyped)
3824     MIB.addImm(Format);
3825 
3826   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3827      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3828      .addMemOperand(MMO);
3829 
3830   if (LoadDstReg != Dst) {
3831     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3832 
    // The result was widened for an extending load; truncate it back.
3834     if (IsExtLoad)
3835       B.buildTrunc(Dst, LoadDstReg);
3836     else {
3837       // Repack to original 16-bit vector result
3838       // FIXME: G_TRUNC should work, but legalization currently fails
3839       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3840       SmallVector<Register, 4> Repack;
3841       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3842         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3843       B.buildMerge(Dst, Repack);
3844     }
3845   }
3846 
3847   MI.eraseFromParent();
3848   return true;
3849 }
3850 
3851 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3852                                                MachineIRBuilder &B,
3853                                                bool IsInc) const {
3854   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3855                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3856   B.buildInstr(Opc)
3857     .addDef(MI.getOperand(0).getReg())
3858     .addUse(MI.getOperand(2).getReg())
3859     .addUse(MI.getOperand(3).getReg())
3860     .cloneMemRefs(MI);
3861   MI.eraseFromParent();
3862   return true;
3863 }
3864 
3865 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3866   switch (IntrID) {
3867   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3868   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3869     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3870   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3871   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3872     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3873   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3874   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3875     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3876   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3877   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3878     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3879   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3880   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3881     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3882   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3883   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3884     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3885   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3886   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3887     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3888   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3889   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3890     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3891   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3892   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3893     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3894   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3895   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3896     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3897   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3898   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3899     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3900   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3901   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3902     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3903   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3904   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3905     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3906   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3907   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3908     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3909   default:
3910     llvm_unreachable("unhandled atomic opcode");
3911   }
3912 }
3913 
3914 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3915                                                MachineIRBuilder &B,
3916                                                Intrinsic::ID IID) const {
3917   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3918                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3919   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3920 
3921   Register Dst;
3922 
3923   int OpOffset = 0;
3924   if (HasReturn) {
3925     // A few FP atomics do not support return values.
3926     Dst = MI.getOperand(0).getReg();
3927   } else {
3928     OpOffset = -1;
3929   }
3930 
3931   Register VData = MI.getOperand(2 + OpOffset).getReg();
3932   Register CmpVal;
3933 
3934   if (IsCmpSwap) {
3935     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3936     ++OpOffset;
3937   }
3938 
3939   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3940   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3941 
3942   // The struct intrinsic variants add one additional operand over raw.
3943   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3944   Register VIndex;
3945   if (HasVIndex) {
3946     VIndex = MI.getOperand(4 + OpOffset).getReg();
3947     ++OpOffset;
3948   }
3949 
3950   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3951   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3952   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3953 
3954   MachineMemOperand *MMO = *MI.memoperands_begin();
3955 
3956   unsigned ImmOffset;
3957   unsigned TotalOffset;
3958   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3959   if (TotalOffset != 0)
3960     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3961 
3962   if (!VIndex)
3963     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3964 
3965   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3966 
3967   if (HasReturn)
3968     MIB.addDef(Dst);
3969 
3970   MIB.addUse(VData); // vdata
3971 
3972   if (IsCmpSwap)
3973     MIB.addReg(CmpVal);
3974 
3975   MIB.addUse(RSrc)               // rsrc
3976      .addUse(VIndex)             // vindex
3977      .addUse(VOffset)            // voffset
3978      .addUse(SOffset)            // soffset
3979      .addImm(ImmOffset)          // offset(imm)
3980      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3981      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3982      .addMemOperand(MMO);
3983 
3984   MI.eraseFromParent();
3985   return true;
3986 }
3987 
/// Pack the s16 typed address registers of \p MI into dword sized <2 x s16>
/// vectors, appending the results to \p PackedAddrs.
3990 static void packImageA16AddressToDwords(
3991     MachineIRBuilder &B, MachineInstr &MI,
3992     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3993     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3994   const LLT S16 = LLT::scalar(16);
3995   const LLT V2S16 = LLT::vector(2, 16);
3996 
3997   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3998     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3999     if (!SrcOp.isReg())
4000       continue; // _L to _LZ may have eliminated this.
4001 
4002     Register AddrReg = SrcOp.getReg();
4003 
4004     if (I < Intr->GradientStart) {
4005       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4006       PackedAddrs.push_back(AddrReg);
4007     } else {
4008       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4009       // derivatives dx/dh and dx/dv are packed with undef.
4010       if (((I + 1) >= EndIdx) ||
4011           ((Intr->NumGradients / 2) % 2 == 1 &&
4012            (I == static_cast<unsigned>(Intr->GradientStart +
4013                                        (Intr->NumGradients / 2) - 1) ||
4014             I == static_cast<unsigned>(Intr->GradientStart +
4015                                        Intr->NumGradients - 1))) ||
4016           // Check for _L to _LZ optimization
4017           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4018         PackedAddrs.push_back(
4019             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4020                 .getReg(0));
4021       } else {
4022         PackedAddrs.push_back(
4023             B.buildBuildVector(
4024                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4025                 .getReg(0));
4026         ++I;
4027       }
4028     }
4029   }
4030 }
4031 
4032 /// Convert from separate vaddr components to a single vector address register,
4033 /// and replace the remaining operands with $noreg.
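/// For example, three s32 vaddr operands become a single <3 x s32> operand
/// followed by $noreg for the now-unused slots.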
4034 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4035                                      int DimIdx, int NumVAddrs) {
4036   const LLT S32 = LLT::scalar(32);
4037 
4038   SmallVector<Register, 8> AddrRegs;
4039   for (int I = 0; I != NumVAddrs; ++I) {
4040     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4041     if (SrcOp.isReg()) {
4042       AddrRegs.push_back(SrcOp.getReg());
4043       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4044     }
4045   }
4046 
4047   int NumAddrRegs = AddrRegs.size();
4048   if (NumAddrRegs != 1) {
4049     // Round up to 8 elements for v5-v7
4050     // FIXME: Missing intermediate sized register classes and instructions.
4051     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4052       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4053       auto Undef = B.buildUndef(S32);
4054       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4055       NumAddrRegs = RoundedNumRegs;
4056     }
4057 
4058     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4059     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4060   }
4061 
4062   for (int I = 1; I != NumVAddrs; ++I) {
4063     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4064     if (SrcOp.isReg())
4065       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4066   }
4067 }
4068 
4069 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4070 ///
/// Depending on the subtarget, loads and stores with 16-bit element data need to be
4072 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4073 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4074 /// registers.
4075 ///
4076 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4081 /// now unnecessary arguments with $noreg.
4082 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4083     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4084     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4085 
4086   const unsigned NumDefs = MI.getNumExplicitDefs();
4087   const unsigned ArgOffset = NumDefs + 1;
4088   bool IsTFE = NumDefs == 2;
4089   // We are only processing the operands of d16 image operations on subtargets
4090   // that use the unpacked register layout, or need to repack the TFE result.
4091 
4092   // TODO: Do we need to guard against already legalized intrinsics?
4093   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4094       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4095 
4096   MachineRegisterInfo *MRI = B.getMRI();
4097   const LLT S32 = LLT::scalar(32);
4098   const LLT S16 = LLT::scalar(16);
4099   const LLT V2S16 = LLT::vector(2, 16);
4100 
4101   unsigned DMask = 0;
4102 
  // Check for 16-bit addresses and pack them if so.
4104   LLT GradTy =
4105       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4106   LLT AddrTy =
4107       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4108   const bool IsG16 = GradTy == S16;
4109   const bool IsA16 = AddrTy == S16;
4110 
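  // Each set bit in the dmask enables one result/data component, so its
  // popcount gives the number of lanes to load or store.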
4111   int DMaskLanes = 0;
4112   if (!BaseOpcode->Atomic) {
4113     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4114     if (BaseOpcode->Gather4) {
4115       DMaskLanes = 4;
4116     } else if (DMask != 0) {
4117       DMaskLanes = countPopulation(DMask);
4118     } else if (!IsTFE && !BaseOpcode->Store) {
4119       // If dmask is 0, this is a no-op load. This can be eliminated.
4120       B.buildUndef(MI.getOperand(0));
4121       MI.eraseFromParent();
4122       return true;
4123     }
4124   }
4125 
4126   Observer.changingInstr(MI);
4127   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4128 
4129   unsigned NewOpcode = NumDefs == 0 ?
4130     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4131 
4132   // Track that we legalized this
4133   MI.setDesc(B.getTII().get(NewOpcode));
4134 
  // We expect to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4137   if (IsTFE && DMask == 0) {
4138     DMask = 0x1;
4139     DMaskLanes = 1;
4140     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4141   }
4142 
4143   if (BaseOpcode->Atomic) {
4144     Register VData0 = MI.getOperand(2).getReg();
4145     LLT Ty = MRI->getType(VData0);
4146 
4147     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4148     if (Ty.isVector())
4149       return false;
4150 
4151     if (BaseOpcode->AtomicX2) {
4152       Register VData1 = MI.getOperand(3).getReg();
4153       // The two values are packed in one register.
4154       LLT PackedTy = LLT::vector(2, Ty);
4155       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4156       MI.getOperand(2).setReg(Concat.getReg(0));
4157       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4158     }
4159   }
4160 
4161   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4162 
4163   // Optimize _L to _LZ when _L is zero
4164   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4165           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4166     const ConstantFP *ConstantLod;
4167 
4168     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4169                  m_GFCst(ConstantLod))) {
4170       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4171         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4172         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4173             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4174                                                       Intr->Dim);
4175 
4176         // The starting indexes should remain in the same place.
4177         --CorrectedNumVAddrs;
4178 
4179         MI.getOperand(MI.getNumExplicitDefs())
4180             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4181         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4182         Intr = NewImageDimIntr;
4183       }
4184     }
4185   }
4186 
  // Optimize _mip away when 'lod' is zero.
4188   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4189     int64_t ConstantLod;
4190     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4191                  m_ICst(ConstantLod))) {
4192       if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
4195         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4196         --CorrectedNumVAddrs;
4197       }
4198     }
4199   }
4200 
4201   // Rewrite the addressing register layout before doing anything else.
4202   if (IsA16 || IsG16) {
4203     if (IsA16) {
4204       // Target must support the feature and gradients need to be 16 bit too
4205       if (!ST.hasA16() || !IsG16)
4206         return false;
4207     } else if (!ST.hasG16())
4208       return false;
4209 
4210     if (Intr->NumVAddrs > 1) {
4211       SmallVector<Register, 4> PackedRegs;
4212       // Don't compress addresses for G16
4213       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4214       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4215                                   PackEndIdx);
4216 
4217       if (!IsA16) {
4218         // Add uncompressed address
4219         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
          Register AddrReg = MI.getOperand(ArgOffset + I).getReg();
4221           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4222           PackedRegs.push_back(AddrReg);
4223         }
4224       }
4225 
4226       // See also below in the non-a16 branch
4227       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4228 
4229       if (!UseNSA && PackedRegs.size() > 1) {
4230         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4231         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4232         PackedRegs[0] = Concat.getReg(0);
4233         PackedRegs.resize(1);
4234       }
4235 
4236       const unsigned NumPacked = PackedRegs.size();
4237       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4238         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4239         if (!SrcOp.isReg()) {
4240           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4241           continue;
4242         }
4243 
4244         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4245 
4246         if (I - Intr->VAddrStart < NumPacked)
4247           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4248         else
4249           SrcOp.setReg(AMDGPU::NoRegister);
4250       }
4251     }
4252   } else {
4253     // If the register allocator cannot place the address registers contiguously
4254     // without introducing moves, then using the non-sequential address encoding
4255     // is always preferable, since it saves VALU instructions and is usually a
4256     // wash in terms of code size or even better.
4257     //
4258     // However, we currently have no way of hinting to the register allocator
4259     // that MIMG addresses should be placed contiguously when it is possible to
4260     // do so, so force non-NSA for the common 2-address case as a heuristic.
4261     //
4262     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4263     // allocation when possible.
4264     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4265 
4266     if (!UseNSA && Intr->NumVAddrs > 1)
4267       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4268                                Intr->NumVAddrs);
4269   }
4270 
4271   int Flags = 0;
4272   if (IsA16)
4273     Flags |= 1;
4274   if (IsG16)
4275     Flags |= 2;
4276   MI.addOperand(MachineOperand::CreateImm(Flags));
4277 
4278   if (BaseOpcode->Store) { // No TFE for stores?
4279     // TODO: Handle dmask trim
4280     Register VData = MI.getOperand(1).getReg();
4281     LLT Ty = MRI->getType(VData);
4282     if (!Ty.isVector() || Ty.getElementType() != S16)
4283       return true;
4284 
4285     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4286     if (RepackedReg != VData) {
4287       MI.getOperand(1).setReg(RepackedReg);
4288     }
4289 
4290     return true;
4291   }
4292 
4293   Register DstReg = MI.getOperand(0).getReg();
4294   LLT Ty = MRI->getType(DstReg);
4295   const LLT EltTy = Ty.getScalarType();
4296   const bool IsD16 = Ty.getScalarType() == S16;
4297   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4298 
4299   // Confirm that the return type is large enough for the dmask specified
4300   if (NumElts < DMaskLanes)
4301     return false;
4302 
4303   if (NumElts > 4 || DMaskLanes > 4)
4304     return false;
4305 
4306   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4307   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4308 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4312   LLT RoundedTy;
4313 
  // S32 vector to cover all data, plus the TFE result element.
4315   LLT TFETy;
4316 
4317   // Register type to use for each loaded component. Will be S32 or V2S16.
4318   LLT RegTy;
4319 
4320   if (IsD16 && ST.hasUnpackedD16VMem()) {
4321     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4322     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4323     RegTy = S32;
4324   } else {
4325     unsigned EltSize = EltTy.getSizeInBits();
4326     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4327     unsigned RoundedSize = 32 * RoundedElts;
4328     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4329     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4330     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4331   }
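  // For example, a packed-D16 TFE load of <3 x s16> would use
  // RoundedTy = <4 x s16>, TFETy = <3 x s32> and RegTy = s32 here.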
4332 
4333   // The return type does not need adjustment.
4334   // TODO: Should we change s16 case to s32 or <2 x s16>?
4335   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4336     return true;
4337 
4338   Register Dst1Reg;
4339 
4340   // Insert after the instruction.
4341   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4342 
4343   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4344   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4345   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4346   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4347 
4348   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4349 
4350   MI.getOperand(0).setReg(NewResultReg);
4351 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4356 
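  // For instance, a TFE load that was {<4 x s32>, i32} in the IR now
  // produces one <5 x s32>; the extra status dword is split back out into
  // the original second result below.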
4357   if (IsTFE) {
4358     Dst1Reg = MI.getOperand(1).getReg();
4359     if (MRI->getType(Dst1Reg) != S32)
4360       return false;
4361 
4362     // TODO: Make sure the TFE operand bit is set.
4363     MI.RemoveOperand(1);
4364 
4365     // Handle the easy case that requires no repack instructions.
4366     if (Ty == S32) {
4367       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4368       return true;
4369     }
4370   }
4371 
4372   // Now figure out how to copy the new result register back into the old
4373   // result.
4374   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4375 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4377 
4378   if (ResultNumRegs == 1) {
4379     assert(!IsTFE);
4380     ResultRegs[0] = NewResultReg;
4381   } else {
4382     // We have to repack into a new vector of some kind.
4383     for (int I = 0; I != NumDataRegs; ++I)
4384       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4385     B.buildUnmerge(ResultRegs, NewResultReg);
4386 
4387     // Drop the final TFE element to get the data part. The TFE result is
4388     // directly written to the right place already.
4389     if (IsTFE)
4390       ResultRegs.resize(NumDataRegs);
4391   }
4392 
4393   // For an s16 scalar result, we form an s32 result with a truncate regardless
4394   // of packed vs. unpacked.
4395   if (IsD16 && !Ty.isVector()) {
4396     B.buildTrunc(DstReg, ResultRegs[0]);
4397     return true;
4398   }
4399 
4400   // Avoid a build/concat_vector of 1 entry.
4401   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4402     B.buildBitcast(DstReg, ResultRegs[0]);
4403     return true;
4404   }
4405 
4406   assert(Ty.isVector());
4407 
4408   if (IsD16) {
4409     // For packed D16 results with TFE enabled, all the data components are
4410     // S32. Cast back to the expected type.
4411     //
    // TODO: We don't really need to load into s32 elements. We would only
    // need one cast for the TFE result if a multiple of v2s16 were used.
4414     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4415       for (Register &Reg : ResultRegs)
4416         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4417     } else if (ST.hasUnpackedD16VMem()) {
4418       for (Register &Reg : ResultRegs)
4419         Reg = B.buildTrunc(S16, Reg).getReg(0);
4420     }
4421   }
4422 
  auto padWithUndef = [&](LLT PadTy, int NumPadElts) {
    if (NumPadElts == 0)
      return;
    Register Undef = B.buildUndef(PadTy).getReg(0);
    for (int I = 0; I != NumPadElts; ++I)
      ResultRegs.push_back(Undef);
  };
4430 
4431   // Pad out any elements eliminated due to the dmask.
4432   LLT ResTy = MRI->getType(ResultRegs[0]);
4433   if (!ResTy.isVector()) {
4434     padWithUndef(ResTy, NumElts - ResultRegs.size());
4435     B.buildBuildVector(DstReg, ResultRegs);
4436     return true;
4437   }
4438 
4439   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4440   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4441 
4442   // Deal with the one annoying legal case.
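  // A <3 x s16> result is padded out to three v2s16 registers, concatenated
  // to <6 x s16>, and the low <3 x s16> half is unmerged into DstReg; the
  // high half is dead.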
4443   const LLT V3S16 = LLT::vector(3, 16);
4444   if (Ty == V3S16) {
4445     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4446     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4447     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4448     return true;
4449   }
4450 
4451   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4452   B.buildConcatVectors(DstReg, ResultRegs);
4453   return true;
4454 }
4455 
4456 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4457   LegalizerHelper &Helper, MachineInstr &MI) const {
4458   MachineIRBuilder &B = Helper.MIRBuilder;
4459   GISelChangeObserver &Observer = Helper.Observer;
4460 
4461   Register Dst = MI.getOperand(0).getReg();
4462   LLT Ty = B.getMRI()->getType(Dst);
4463   unsigned Size = Ty.getSizeInBits();
4464   MachineFunction &MF = B.getMF();
4465 
4466   Observer.changingInstr(MI);
4467 
4468   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4469     Ty = getBitcastRegisterType(Ty);
4470     Helper.bitcastDst(MI, Ty, 0);
4471     Dst = MI.getOperand(0).getReg();
4472     B.setInsertPt(B.getMBB(), MI);
4473   }
4474 
4475   // FIXME: We don't really need this intermediate instruction. The intrinsic
4476   // should be fixed to have a memory operand. Since it's readnone, we're not
4477   // allowed to add one.
4478   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4479   MI.RemoveOperand(1); // Remove intrinsic ID
4480 
4481   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4482   // TODO: Should this use datalayout alignment?
4483   const unsigned MemSize = (Size + 7) / 8;
4484   const Align MemAlign(4);
4485   MachineMemOperand *MMO = MF.getMachineMemOperand(
4486       MachinePointerInfo(),
4487       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4488           MachineMemOperand::MOInvariant,
4489       MemSize, MemAlign);
4490   MI.addMemOperand(MF, MMO);
4491 
4492   // There are no 96-bit result scalar loads, but widening to 128-bit should
4493   // always be legal. We may need to restore this to a 96-bit result if it turns
4494   // out this needs to be converted to a vector load during RegBankSelect.
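  // E.g. an s96 result is widened to s128 and a <3 x s32> result gains a
  // fourth element.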
4495   if (!isPowerOf2_32(Size)) {
4496     if (Ty.isVector())
4497       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4498     else
4499       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4500   }
4501 
4502   Observer.changedInstr(MI);
4503   return true;
4504 }
4505 
4506 // TODO: Move to selection
4507 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4508                                                 MachineRegisterInfo &MRI,
4509                                                 MachineIRBuilder &B) const {
  // If the trap handler ABI is not HSA or the trap handler is disabled, just
  // lower to an s_endpgm instruction.
4511   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4512       !ST.isTrapHandlerEnabled()) {
4513     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4514   } else {
4515     // Pass queue pointer to trap handler as input, and insert trap instruction
4516     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4517     MachineRegisterInfo &MRI = *B.getMRI();
4518 
    Register LiveIn = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4521     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4522       return false;
4523 
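    // The HSA trap handler ABI expects the queue pointer in SGPR0/SGPR1, so
    // copy it into that physical register pair before emitting s_trap.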
4524     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4525     B.buildCopy(SGPR01, LiveIn);
4526     B.buildInstr(AMDGPU::S_TRAP)
4527         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4528         .addReg(SGPR01, RegState::Implicit);
4529   }
4530 
4531   MI.eraseFromParent();
4532   return true;
4533 }
4534 
4535 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4536     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If the trap handler ABI is not HSA or the trap handler is disabled,
  // report a warning that debugtrap is not supported and drop the intrinsic.
4539   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4540       !ST.isTrapHandlerEnabled()) {
4541     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4542                                      "debugtrap handler not supported",
4543                                      MI.getDebugLoc(), DS_Warning);
4544     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4545     Ctx.diagnose(NoTrap);
4546   } else {
4547     // Insert debug-trap instruction
4548     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4549   }
4550 
4551   MI.eraseFromParent();
4552   return true;
4553 }
4554 
4555 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4556                                                MachineIRBuilder &B) const {
4557   MachineRegisterInfo &MRI = *B.getMRI();
4558   const LLT S16 = LLT::scalar(16);
4559   const LLT S32 = LLT::scalar(32);
4560 
4561   Register DstReg = MI.getOperand(0).getReg();
4562   Register NodePtr = MI.getOperand(2).getReg();
4563   Register RayExtent = MI.getOperand(3).getReg();
4564   Register RayOrigin = MI.getOperand(4).getReg();
4565   Register RayDir = MI.getOperand(5).getReg();
4566   Register RayInvDir = MI.getOperand(6).getReg();
4567   Register TDescr = MI.getOperand(7).getReg();
4568 
4569   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4571   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4572                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4573                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4574                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4575 
4576   SmallVector<Register, 12> Ops;
4577   if (Is64) {
4578     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4579     Ops.push_back(Unmerge.getReg(0));
4580     Ops.push_back(Unmerge.getReg(1));
4581   } else {
4582     Ops.push_back(NodePtr);
4583   }
4584   Ops.push_back(RayExtent);
4585 
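  // Each ray vector operand carries four lanes, but only the first three
  // (x, y, z) are pushed as address dwords.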
  auto packLanes = [&Ops, &S32, &B](Register Src) {
4587     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4588     Ops.push_back(Unmerge.getReg(0));
4589     Ops.push_back(Unmerge.getReg(1));
4590     Ops.push_back(Unmerge.getReg(2));
4591   };
4592 
4593   packLanes(RayOrigin);
4594   if (IsA16) {
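    // With 16-bit components, the six used halves are repacked into three
    // dwords: {dir.x, dir.y}, {dir.z, inv_dir.x} and {inv_dir.y, inv_dir.z}.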
4595     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4596     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4597     Register R1 = MRI.createGenericVirtualRegister(S32);
4598     Register R2 = MRI.createGenericVirtualRegister(S32);
4599     Register R3 = MRI.createGenericVirtualRegister(S32);
4600     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4601     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4602     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4603     Ops.push_back(R1);
4604     Ops.push_back(R2);
4605     Ops.push_back(R3);
4606   } else {
4607     packLanes(RayDir);
4608     packLanes(RayInvDir);
4609   }
4610 
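  // Emit the target-specific pseudo: destination, MIMG opcode, the gathered
  // address dwords, the texture descriptor and an A16 flag, preserving the
  // original memory operands.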
4611   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4612     .addDef(DstReg)
4613     .addImm(Opcode);
4614 
  for (Register R : Ops)
    MIB.addUse(R);
4618 
4619   MIB.addUse(TDescr)
4620      .addImm(IsA16 ? 1 : 0)
4621      .cloneMemRefs(MI);
4622 
4623   MI.eraseFromParent();
4624   return true;
4625 }
4626 
4627 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4628                                             MachineInstr &MI) const {
4629   MachineIRBuilder &B = Helper.MIRBuilder;
4630   MachineRegisterInfo &MRI = *B.getMRI();
4631 
  // Replace a G_BRCOND that uses one of these control-flow intrinsics with
  // the exec-manipulating branch pseudos.
4633   auto IntrID = MI.getIntrinsicID();
4634   switch (IntrID) {
4635   case Intrinsic::amdgcn_if:
4636   case Intrinsic::amdgcn_else: {
4637     MachineInstr *Br = nullptr;
4638     MachineBasicBlock *UncondBrTarget = nullptr;
4639     bool Negated = false;
4640     if (MachineInstr *BrCond =
4641             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4642       const SIRegisterInfo *TRI
4643         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4644 
4645       Register Def = MI.getOperand(1).getReg();
4646       Register Use = MI.getOperand(3).getReg();
4647 
4648       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4649 
4650       if (Negated)
4651         std::swap(CondBrTarget, UncondBrTarget);
4652 
4653       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4654       if (IntrID == Intrinsic::amdgcn_if) {
4655         B.buildInstr(AMDGPU::SI_IF)
4656           .addDef(Def)
4657           .addUse(Use)
4658           .addMBB(UncondBrTarget);
4659       } else {
4660         B.buildInstr(AMDGPU::SI_ELSE)
4661             .addDef(Def)
4662             .addUse(Use)
4663             .addMBB(UncondBrTarget);
4664       }
4665 
4666       if (Br) {
4667         Br->getOperand(0).setMBB(CondBrTarget);
4668       } else {
4669         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4670         // since we're swapping branch targets it needs to be reinserted.
4671         // FIXME: IRTranslator should probably not do this
4672         B.buildBr(*CondBrTarget);
4673       }
4674 
4675       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4676       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4677       MI.eraseFromParent();
4678       BrCond->eraseFromParent();
4679       return true;
4680     }
4681 
4682     return false;
4683   }
4684   case Intrinsic::amdgcn_loop: {
4685     MachineInstr *Br = nullptr;
4686     MachineBasicBlock *UncondBrTarget = nullptr;
4687     bool Negated = false;
4688     if (MachineInstr *BrCond =
4689             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4690       const SIRegisterInfo *TRI
4691         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4692 
4693       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4694       Register Reg = MI.getOperand(2).getReg();
4695 
4696       if (Negated)
4697         std::swap(CondBrTarget, UncondBrTarget);
4698 
4699       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4700       B.buildInstr(AMDGPU::SI_LOOP)
4701         .addUse(Reg)
4702         .addMBB(UncondBrTarget);
4703 
4704       if (Br)
4705         Br->getOperand(0).setMBB(CondBrTarget);
4706       else
4707         B.buildBr(*CondBrTarget);
4708 
4709       MI.eraseFromParent();
4710       BrCond->eraseFromParent();
4711       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4712       return true;
4713     }
4714 
4715     return false;
4716   }
4717   case Intrinsic::amdgcn_kernarg_segment_ptr:
4718     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4719       // This only makes sense to call in a kernel, so just lower to null.
4720       B.buildConstant(MI.getOperand(0).getReg(), 0);
4721       MI.eraseFromParent();
4722       return true;
4723     }
4724 
4725     return legalizePreloadedArgIntrin(
4726       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4727   case Intrinsic::amdgcn_implicitarg_ptr:
4728     return legalizeImplicitArgPtr(MI, MRI, B);
4729   case Intrinsic::amdgcn_workitem_id_x:
4730     return legalizePreloadedArgIntrin(MI, MRI, B,
4731                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4732   case Intrinsic::amdgcn_workitem_id_y:
4733     return legalizePreloadedArgIntrin(MI, MRI, B,
4734                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4735   case Intrinsic::amdgcn_workitem_id_z:
4736     return legalizePreloadedArgIntrin(MI, MRI, B,
4737                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4738   case Intrinsic::amdgcn_workgroup_id_x:
4739     return legalizePreloadedArgIntrin(MI, MRI, B,
4740                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4741   case Intrinsic::amdgcn_workgroup_id_y:
4742     return legalizePreloadedArgIntrin(MI, MRI, B,
4743                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4744   case Intrinsic::amdgcn_workgroup_id_z:
4745     return legalizePreloadedArgIntrin(MI, MRI, B,
4746                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4747   case Intrinsic::amdgcn_dispatch_ptr:
4748     return legalizePreloadedArgIntrin(MI, MRI, B,
4749                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4750   case Intrinsic::amdgcn_queue_ptr:
4751     return legalizePreloadedArgIntrin(MI, MRI, B,
4752                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4753   case Intrinsic::amdgcn_implicit_buffer_ptr:
4754     return legalizePreloadedArgIntrin(
4755       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4756   case Intrinsic::amdgcn_dispatch_id:
4757     return legalizePreloadedArgIntrin(MI, MRI, B,
4758                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4759   case Intrinsic::amdgcn_fdiv_fast:
4760     return legalizeFDIVFastIntrin(MI, MRI, B);
4761   case Intrinsic::amdgcn_is_shared:
4762     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4763   case Intrinsic::amdgcn_is_private:
4764     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4765   case Intrinsic::amdgcn_wavefrontsize: {
4766     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4767     MI.eraseFromParent();
4768     return true;
4769   }
4770   case Intrinsic::amdgcn_s_buffer_load:
4771     return legalizeSBufferLoad(Helper, MI);
4772   case Intrinsic::amdgcn_raw_buffer_store:
4773   case Intrinsic::amdgcn_struct_buffer_store:
4774     return legalizeBufferStore(MI, MRI, B, false, false);
4775   case Intrinsic::amdgcn_raw_buffer_store_format:
4776   case Intrinsic::amdgcn_struct_buffer_store_format:
4777     return legalizeBufferStore(MI, MRI, B, false, true);
4778   case Intrinsic::amdgcn_raw_tbuffer_store:
4779   case Intrinsic::amdgcn_struct_tbuffer_store:
4780     return legalizeBufferStore(MI, MRI, B, true, true);
4781   case Intrinsic::amdgcn_raw_buffer_load:
4782   case Intrinsic::amdgcn_struct_buffer_load:
4783     return legalizeBufferLoad(MI, MRI, B, false, false);
4784   case Intrinsic::amdgcn_raw_buffer_load_format:
4785   case Intrinsic::amdgcn_struct_buffer_load_format:
4786     return legalizeBufferLoad(MI, MRI, B, true, false);
4787   case Intrinsic::amdgcn_raw_tbuffer_load:
4788   case Intrinsic::amdgcn_struct_tbuffer_load:
4789     return legalizeBufferLoad(MI, MRI, B, true, true);
4790   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4791   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4792   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4793   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4794   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4795   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4796   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4797   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4798   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4799   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4800   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4801   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4802   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4803   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4804   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4805   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4806   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4807   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4808   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4809   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4810   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4811   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4812   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4813   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4814   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4815   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4816   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4817   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4818     return legalizeBufferAtomic(MI, B, IntrID);
4819   case Intrinsic::amdgcn_atomic_inc:
4820     return legalizeAtomicIncDec(MI, B, true);
4821   case Intrinsic::amdgcn_atomic_dec:
4822     return legalizeAtomicIncDec(MI, B, false);
4823   case Intrinsic::trap:
4824     return legalizeTrapIntrinsic(MI, MRI, B);
4825   case Intrinsic::debugtrap:
4826     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4827   case Intrinsic::amdgcn_rsq_clamp:
4828     return legalizeRsqClampIntrinsic(MI, MRI, B);
4829   case Intrinsic::amdgcn_ds_fadd:
4830   case Intrinsic::amdgcn_ds_fmin:
4831   case Intrinsic::amdgcn_ds_fmax:
4832     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4833   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4834     return legalizeBVHIntrinsic(MI, B);
4835   default: {
4836     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4837             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4838       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4839     return true;
4840   }
4841   }
4842 
4843   return true;
4844 }
4845