1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/ADT/ScopeExit.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 
28 #define DEBUG_TYPE "amdgpu-legalinfo"
29 
30 using namespace llvm;
31 using namespace LegalizeActions;
32 using namespace LegalizeMutations;
33 using namespace LegalityPredicates;
34 using namespace MIPatternMatch;
35 
36 // Hack until load/store selection patterns support any tuple of legal types.
37 static cl::opt<bool> EnableNewLegality(
38   "amdgpu-global-isel-new-legality",
39   cl::desc("Use GlobalISel desired legality, rather than try to use "
40            "rules compatible with selection patterns"),
41   cl::init(false),
42   cl::ReallyHidden);
43 
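// Maximum register size in bits handled here; 1024 bits corresponds to a
// 32 x 32-bit register tuple.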
44 static constexpr unsigned MaxRegisterSize = 1024;
45 
46 // Round the number of vector elements up to the next power of two.
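// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.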
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the scalar size in bits up to the next power of two.
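// e.g. s48 -> s64, s17 -> s32.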
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 /// \returns true if this is an odd-sized vector which should be widened by
61 /// adding an additional element. This is mostly to handle <3 x s16> ->
62 /// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     if (!Ty.isVector())
67       return false;
68 
69     const LLT EltTy = Ty.getElementType();
70     const unsigned EltSize = EltTy.getSizeInBits();
71     return Ty.getNumElements() % 2 != 0 &&
72            EltSize > 1 && EltSize < 32 &&
73            Ty.getSizeInBits() % 32 != 0;
74   };
75 }
76 
77 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
78   return [=](const LegalityQuery &Query) {
79     const LLT Ty = Query.Types[TypeIdx];
80     return Ty.getSizeInBits() % 32 == 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
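// Reduce the element count so each resulting piece is at most 64 bits wide,
// keeping the element type, e.g. <3 x s32> -> <2 x s32>, <5 x s16> -> <3 x s16>.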
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
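// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits),
// <5 x s16> (80 bits) -> <6 x s16> (96 bits).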
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
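    // Despite the name, NextMul32 is the number of 32-bit slots needed to
    // cover Size bits.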
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LLT getBitcastRegisterType(const LLT Ty) {
130   const unsigned Size = Ty.getSizeInBits();
131 
133   if (Size <= 32) {
134     // <2 x s8> -> s16
135     // <4 x s8> -> s32
136     return LLT::scalar(Size);
137   }
138 
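  // Larger (register-sized) values use one 32-bit element per 32 bits of the
  // value, e.g. <6 x s16> (96 bits) -> <3 x s32>, <8 x s8> (64 bits) -> <2 x s32>.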
139   return LLT::scalarOrVector(Size / 32, 32);
140 }
141 
142 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
143   return [=](const LegalityQuery &Query) {
144     const LLT Ty = Query.Types[TypeIdx];
145     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
146   };
147 }
148 
149 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT Ty = Query.Types[TypeIdx];
152     unsigned Size = Ty.getSizeInBits();
153     assert(Size % 32 == 0);
154     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
155   };
156 }
157 
158 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
159   return [=](const LegalityQuery &Query) {
160     const LLT QueryTy = Query.Types[TypeIdx];
161     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
162   };
163 }
164 
165 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
166   return [=](const LegalityQuery &Query) {
167     const LLT QueryTy = Query.Types[TypeIdx];
168     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
169   };
170 }
171 
172 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
173   return [=](const LegalityQuery &Query) {
174     const LLT QueryTy = Query.Types[TypeIdx];
175     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
176   };
177 }
178 
179 static bool isRegisterSize(unsigned Size) {
180   return Size % 32 == 0 && Size <= MaxRegisterSize;
181 }
182 
183 static bool isRegisterVectorElementType(LLT EltTy) {
184   const int EltSize = EltTy.getSizeInBits();
185   return EltSize == 16 || EltSize % 32 == 0;
186 }
187 
188 static bool isRegisterVectorType(LLT Ty) {
189   const int EltSize = Ty.getElementType().getSizeInBits();
190   return EltSize == 32 || EltSize == 64 ||
191          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
192          EltSize == 128 || EltSize == 256;
193 }
194 
195 static bool isRegisterType(LLT Ty) {
196   if (!isRegisterSize(Ty.getSizeInBits()))
197     return false;
198 
199   if (Ty.isVector())
200     return isRegisterVectorType(Ty);
201 
202   return true;
203 }
204 
205 // Any combination of 32 or 64-bit elements up to the maximum register size, and
206 // multiples of v2s16.
207 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
208   return [=](const LegalityQuery &Query) {
209     return isRegisterType(Query.Types[TypeIdx]);
210   };
211 }
212 
213 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     const LLT QueryTy = Query.Types[TypeIdx];
216     if (!QueryTy.isVector())
217       return false;
218     const LLT EltTy = QueryTy.getElementType();
219     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
220   };
221 }
222 
223 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
224   return [=](const LegalityQuery &Query) {
225     const LLT Ty = Query.Types[TypeIdx];
226     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
227            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
228   };
229 }
230 
231 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
232 // handle some operations by just promoting the register during
233 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
234 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
235                                     bool IsLoad) {
236   switch (AS) {
237   case AMDGPUAS::PRIVATE_ADDRESS:
238     // FIXME: Private element size.
239     return ST.enableFlatScratch() ? 128 : 32;
240   case AMDGPUAS::LOCAL_ADDRESS:
241     return ST.useDS128() ? 128 : 64;
242   case AMDGPUAS::GLOBAL_ADDRESS:
243   case AMDGPUAS::CONSTANT_ADDRESS:
244   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
245     // Treat constant and global as identical. SMRD loads are sometimes usable for
246     // global loads (ideally constant address space should be eliminated)
247     // depending on the context. Legality cannot be context dependent, but
248     // RegBankSelect can split the load as necessary depending on the pointer
249     // register bank/uniformity and if the memory is invariant or not written in a
250     // kernel.
251     return IsLoad ? 512 : 128;
252   default:
253     // Flat addresses may contextually need to be split to 32-bit parts if they
254     // may alias scratch depending on the subtarget.
255     return 128;
256   }
257 }
258 
259 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
260                                  const LegalityQuery &Query,
261                                  unsigned Opcode) {
262   const LLT Ty = Query.Types[0];
263 
264   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
265   const bool IsLoad = Opcode != AMDGPU::G_STORE;
266 
267   unsigned RegSize = Ty.getSizeInBits();
268   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
269   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
270   unsigned AS = Query.Types[1].getAddressSpace();
271 
272   // All of these need to be custom lowered to cast the pointer operand.
273   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
274     return false;
275 
276   // TODO: We should be able to widen loads if the alignment is high enough, but
277   // we also need to modify the memory access size.
278 #if 0
279   // Accept widening loads based on alignment.
280   if (IsLoad && MemSize < Size)
281     MemSize = std::max(MemSize, Align);
282 #endif
283 
284   // Only 1-byte and 2-byte to 32-bit extloads are valid.
285   if (MemSize != RegSize && RegSize != 32)
286     return false;
287 
288   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
289     return false;
290 
291   switch (MemSize) {
292   case 8:
293   case 16:
294   case 32:
295   case 64:
296   case 128:
297     break;
298   case 96:
299     if (!ST.hasDwordx3LoadStores())
300       return false;
301     break;
302   case 256:
303   case 512:
304     // These may contextually need to be broken down.
305     break;
306   default:
307     return false;
308   }
309 
310   assert(RegSize >= MemSize);
311 
312   if (AlignBits < MemSize) {
313     const SITargetLowering *TLI = ST.getTargetLowering();
314     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
315                                                  Align(AlignBits / 8)))
316       return false;
317   }
318 
319   return true;
320 }
321 
322 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
323 // work around this. Eventually it should ignore the type for loads and only
324 // care about the size. Return true in cases where we will work around this for
325 // now by bitcasting.
326 static bool loadStoreBitcastWorkaround(const LLT Ty) {
327   if (EnableNewLegality)
328     return false;
329 
330   const unsigned Size = Ty.getSizeInBits();
331   if (Size <= 64)
332     return false;
333   if (!Ty.isVector())
334     return true;
335 
336   LLT EltTy = Ty.getElementType();
337   if (EltTy.isPointer())
338     return true;
339 
340   unsigned EltSize = EltTy.getSizeInBits();
341   return EltSize != 32 && EltSize != 64;
342 }
343 
344 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
345                              unsigned Opcode) {
346   const LLT Ty = Query.Types[0];
347   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
348          !loadStoreBitcastWorkaround(Ty);
349 }
350 
351 /// Return true if a load or store of the type should be lowered with a bitcast
352 /// to a different type.
353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354                                        const unsigned MemSizeInBits) {
355   const unsigned Size = Ty.getSizeInBits();
356   if (Size != MemSizeInBits)
357     return Size <= 32 && Ty.isVector();
358 
359   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360     return true;
361   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362          !isRegisterVectorElementType(Ty.getElementType());
363 }
364 
365 /// Return true if we should legalize a load by widening an odd sized memory
366 /// access up to the alignment. Note that in this case the memory access
367 /// itself changes, not the size of the result register.
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
369                             unsigned AlignInBits, unsigned AddrSpace,
370                             unsigned Opcode) {
371   // We don't want to widen cases that are naturally legal.
372   if (isPowerOf2_32(SizeInBits))
373     return false;
374 
375   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
376   // end up widening these for a scalar load during RegBankSelect, since there
377   // aren't 96-bit scalar loads.
378   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
379     return false;
380 
381   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
382     return false;
383 
384   // A load is known dereferenceable up to the alignment, so it's legal to widen
385   // to it.
386   //
387   // TODO: Could check dereferenceable for less aligned cases.
388   unsigned RoundedSize = NextPowerOf2(SizeInBits);
389   if (AlignInBits < RoundedSize)
390     return false;
391 
392   // Do not widen if it would introduce a slow unaligned load.
393   const SITargetLowering *TLI = ST.getTargetLowering();
394   bool Fast = false;
395   return TLI->allowsMisalignedMemoryAccessesImpl(
396              RoundedSize, AddrSpace, Align(AlignInBits / 8),
397              MachineMemOperand::MOLoad, &Fast) &&
398          Fast;
399 }
400 
401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402                             unsigned Opcode) {
403   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404     return false;
405 
406   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407                          Query.MMODescrs[0].AlignInBits,
408                          Query.Types[1].getAddressSpace(), Opcode);
409 }
410 
411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
412                                          const GCNTargetMachine &TM)
413   : ST(ST_) {
414   using namespace TargetOpcode;
415 
416   auto GetAddrSpacePtr = [&TM](unsigned AS) {
417     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
418   };
419 
420   const LLT S1 = LLT::scalar(1);
421   const LLT S8 = LLT::scalar(8);
422   const LLT S16 = LLT::scalar(16);
423   const LLT S32 = LLT::scalar(32);
424   const LLT S64 = LLT::scalar(64);
425   const LLT S128 = LLT::scalar(128);
426   const LLT S256 = LLT::scalar(256);
427   const LLT S512 = LLT::scalar(512);
428   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
429 
430   const LLT V2S8 = LLT::vector(2, 8);
431   const LLT V2S16 = LLT::vector(2, 16);
432   const LLT V4S16 = LLT::vector(4, 16);
433 
434   const LLT V2S32 = LLT::vector(2, 32);
435   const LLT V3S32 = LLT::vector(3, 32);
436   const LLT V4S32 = LLT::vector(4, 32);
437   const LLT V5S32 = LLT::vector(5, 32);
438   const LLT V6S32 = LLT::vector(6, 32);
439   const LLT V7S32 = LLT::vector(7, 32);
440   const LLT V8S32 = LLT::vector(8, 32);
441   const LLT V9S32 = LLT::vector(9, 32);
442   const LLT V10S32 = LLT::vector(10, 32);
443   const LLT V11S32 = LLT::vector(11, 32);
444   const LLT V12S32 = LLT::vector(12, 32);
445   const LLT V13S32 = LLT::vector(13, 32);
446   const LLT V14S32 = LLT::vector(14, 32);
447   const LLT V15S32 = LLT::vector(15, 32);
448   const LLT V16S32 = LLT::vector(16, 32);
449   const LLT V32S32 = LLT::vector(32, 32);
450 
451   const LLT V2S64 = LLT::vector(2, 64);
452   const LLT V3S64 = LLT::vector(3, 64);
453   const LLT V4S64 = LLT::vector(4, 64);
454   const LLT V5S64 = LLT::vector(5, 64);
455   const LLT V6S64 = LLT::vector(6, 64);
456   const LLT V7S64 = LLT::vector(7, 64);
457   const LLT V8S64 = LLT::vector(8, 64);
458   const LLT V16S64 = LLT::vector(16, 64);
459 
460   std::initializer_list<LLT> AllS32Vectors =
461     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463   std::initializer_list<LLT> AllS64Vectors =
464     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
465 
466   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
467   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
468   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
469   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
470   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
471   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
472   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
473 
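  // Code pointers (e.g. block addresses) are represented as flat pointers; see
  // the G_BLOCK_ADDR rule below.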
474   const LLT CodePtr = FlatPtr;
475 
476   const std::initializer_list<LLT> AddrSpaces64 = {
477     GlobalPtr, ConstantPtr, FlatPtr
478   };
479 
480   const std::initializer_list<LLT> AddrSpaces32 = {
481     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
482   };
483 
484   const std::initializer_list<LLT> FPTypesBase = {
485     S32, S64
486   };
487 
488   const std::initializer_list<LLT> FPTypes16 = {
489     S32, S64, S16
490   };
491 
492   const std::initializer_list<LLT> FPTypesPK16 = {
493     S32, S64, S16, V2S16
494   };
495 
496   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
497 
498   setAction({G_BRCOND, S1}, Legal); // VCC branches
499   setAction({G_BRCOND, S32}, Legal); // SCC branches
500 
501   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
502   // elements for v3s16
503   getActionDefinitionsBuilder(G_PHI)
504     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505     .legalFor(AllS32Vectors)
506     .legalFor(AllS64Vectors)
507     .legalFor(AddrSpaces64)
508     .legalFor(AddrSpaces32)
509     .legalIf(isPointer(0))
510     .clampScalar(0, S16, S256)
511     .widenScalarToNextPow2(0, 32)
512     .clampMaxNumElements(0, S32, 16)
513     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514     .scalarize(0);
515 
516   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517     // Full set of gfx9 features.
518     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
519       .legalFor({S32, S16, V2S16})
520       .clampScalar(0, S16, S32)
521       .clampMaxNumElements(0, S16, 2)
522       .scalarize(0)
523       .widenScalarToNextPow2(0, 32);
524 
525     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526       .legalFor({S32, S16, V2S16}) // Clamp modifier
527       .minScalarOrElt(0, S16)
528       .clampMaxNumElements(0, S16, 2)
529       .scalarize(0)
530       .widenScalarToNextPow2(0, 32)
531       .lower();
532   } else if (ST.has16BitInsts()) {
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16})
535       .clampScalar(0, S16, S32)
536       .scalarize(0)
537       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538 
539     // Technically the saturating operations require clamp bit support, but this
540     // was introduced at the same time as 16-bit operations.
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542       .legalFor({S32, S16}) // Clamp modifier
543       .minScalar(0, S16)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 16)
546       .lower();
547 
548     // We're just lowering this, but it helps get a better result to try to
549     // coerce to the desired type first.
550     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551       .minScalar(0, S16)
552       .scalarize(0)
553       .lower();
554   } else {
555     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
556       .legalFor({S32})
557       .clampScalar(0, S32, S32)
558       .scalarize(0);
559 
560     if (ST.hasIntClamp()) {
561       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562         .legalFor({S32}) // Clamp modifier.
563         .scalarize(0)
564         .minScalarOrElt(0, S32)
565         .lower();
566     } else {
567       // Clamp bit support was added in VI, along with 16-bit operations.
568       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569         .minScalar(0, S32)
570         .scalarize(0)
571         .lower();
572     }
573 
574     // FIXME: DAG expansion gets better results. The widening uses the smaller
575     // range values and goes for the min/max lowering directly.
576     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577       .minScalar(0, S32)
578       .scalarize(0)
579       .lower();
580   }
581 
582   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
583     .customFor({S32, S64})
584     .clampScalar(0, S32, S64)
585     .widenScalarToNextPow2(0, 32)
586     .scalarize(0);
587 
588   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
589                    .legalFor({S32})
590                    .maxScalarOrElt(0, S32);
591 
592   if (ST.hasVOP3PInsts()) {
593     Mulh
594       .clampMaxNumElements(0, S8, 2)
595       .lowerFor({V2S8});
596   }
597 
598   Mulh
599     .scalarize(0)
600     .lower();
601 
602   // Report legal for any types we can handle anywhere. For the cases only legal
603   // on the SALU, RegBankSelect will be able to re-legalize.
604   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
605     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
606     .clampScalar(0, S32, S64)
607     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
608     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
609     .widenScalarToNextPow2(0)
610     .scalarize(0);
611 
612   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
613                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614     .legalFor({{S32, S1}, {S32, S32}})
615     .minScalar(0, S32)
616     // TODO: .scalarize(0)
617     .lower();
618 
619   getActionDefinitionsBuilder(G_BITCAST)
620     // Don't worry about the size constraint.
621     .legalIf(all(isRegisterType(0), isRegisterType(1)))
622     .lower();
623 
624 
625   getActionDefinitionsBuilder(G_CONSTANT)
626     .legalFor({S1, S32, S64, S16, GlobalPtr,
627                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
628     .legalIf(isPointer(0))
629     .clampScalar(0, S32, S64)
630     .widenScalarToNextPow2(0);
631 
632   getActionDefinitionsBuilder(G_FCONSTANT)
633     .legalFor({S32, S64, S16})
634     .clampScalar(0, S16, S64);
635 
636   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
637       .legalIf(isRegisterType(0))
638       // s1 and s16 are special cases because they have legal operations on
639       // them, but don't really occupy registers in the normal way.
640       .legalFor({S1, S16})
641       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
642       .clampScalarOrElt(0, S32, MaxScalar)
643       .widenScalarToNextPow2(0, 32)
644       .clampMaxNumElements(0, S32, 16);
645 
646   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
647 
648   // If the amount is divergent, we have to do a wave reduction to get the
649   // maximum value, so this is expanded during RegBankSelect.
650   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
651     .legalFor({{PrivatePtr, S32}});
652 
653   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
654     .customIf(typeIsNot(0, PrivatePtr));
655 
656   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
657 
658   auto &FPOpActions = getActionDefinitionsBuilder(
659     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
660     .legalFor({S32, S64});
661   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
662     .customFor({S32, S64});
663   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
664     .customFor({S32, S64});
665 
666   if (ST.has16BitInsts()) {
667     if (ST.hasVOP3PInsts())
668       FPOpActions.legalFor({S16, V2S16});
669     else
670       FPOpActions.legalFor({S16});
671 
672     TrigActions.customFor({S16});
673     FDIVActions.customFor({S16});
674   }
675 
676   auto &MinNumMaxNum = getActionDefinitionsBuilder({
677       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
678 
679   if (ST.hasVOP3PInsts()) {
680     MinNumMaxNum.customFor(FPTypesPK16)
681       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682       .clampMaxNumElements(0, S16, 2)
683       .clampScalar(0, S16, S64)
684       .scalarize(0);
685   } else if (ST.has16BitInsts()) {
686     MinNumMaxNum.customFor(FPTypes16)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else {
690     MinNumMaxNum.customFor(FPTypesBase)
691       .clampScalar(0, S32, S64)
692       .scalarize(0);
693   }
694 
695   if (ST.hasVOP3PInsts())
696     FPOpActions.clampMaxNumElements(0, S16, 2);
697 
698   FPOpActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   TrigActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   FDIVActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   getActionDefinitionsBuilder({G_FNEG, G_FABS})
711     .legalFor(FPTypesPK16)
712     .clampMaxNumElements(0, S16, 2)
713     .scalarize(0)
714     .clampScalar(0, S16, S64);
715 
716   if (ST.has16BitInsts()) {
717     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
718       .legalFor({S32, S64, S16})
719       .scalarize(0)
720       .clampScalar(0, S16, S64);
721   } else {
722     getActionDefinitionsBuilder(G_FSQRT)
723       .legalFor({S32, S64})
724       .scalarize(0)
725       .clampScalar(0, S32, S64);
726 
727     if (ST.hasFractBug()) {
728       getActionDefinitionsBuilder(G_FFLOOR)
729         .customFor({S64})
730         .legalFor({S32, S64})
731         .scalarize(0)
732         .clampScalar(0, S32, S64);
733     } else {
734       getActionDefinitionsBuilder(G_FFLOOR)
735         .legalFor({S32, S64})
736         .scalarize(0)
737         .clampScalar(0, S32, S64);
738     }
739   }
740 
741   getActionDefinitionsBuilder(G_FPTRUNC)
742     .legalFor({{S32, S64}, {S16, S32}})
743     .scalarize(0)
744     .lower();
745 
746   getActionDefinitionsBuilder(G_FPEXT)
747     .legalFor({{S64, S32}, {S32, S16}})
748     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
749     .scalarize(0);
750 
751   getActionDefinitionsBuilder(G_FSUB)
752       // Use actual fsub instruction
753       .legalFor({S32})
754       // Must use fadd + fneg
755       .lowerFor({S64, S16, V2S16})
756       .scalarize(0)
757       .clampScalar(0, S32, S64);
758 
759   // Whether this is legal depends on the floating point mode for the function.
760   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
761   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
762     FMad.customFor({S32, S16});
763   else if (ST.hasMadMacF32Insts())
764     FMad.customFor({S32});
765   else if (ST.hasMadF16())
766     FMad.customFor({S16});
767   FMad.scalarize(0)
768       .lower();
769 
770   auto &FRem = getActionDefinitionsBuilder(G_FREM);
771   if (ST.has16BitInsts()) {
772     FRem.customFor({S16, S32, S64});
773   } else {
774     FRem.minScalar(0, S32)
775         .customFor({S32, S64});
776   }
777   FRem.scalarize(0);
778 
779   // TODO: Do we need to clamp maximum bitwidth?
780   getActionDefinitionsBuilder(G_TRUNC)
781     .legalIf(isScalar(0))
782     .legalFor({{V2S16, V2S32}})
783     .clampMaxNumElements(0, S16, 2)
784     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
785     // situations (like an invalid implicit use), we don't want to infinite loop
786     // in the legalizer.
787     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
788     .alwaysLegal();
789 
790   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
791     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
792                {S32, S1}, {S64, S1}, {S16, S1}})
793     .scalarize(0)
794     .clampScalar(0, S32, S64)
795     .widenScalarToNextPow2(1, 32);
796 
797   // TODO: Split s1->s64 during regbankselect for VALU.
798   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
799     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
800     .lowerFor({{S32, S64}})
801     .lowerIf(typeIs(1, S1))
802     .customFor({{S64, S64}});
803   if (ST.has16BitInsts())
804     IToFP.legalFor({{S16, S16}});
805   IToFP.clampScalar(1, S32, S64)
806        .minScalar(0, S32)
807        .scalarize(0)
808        .widenScalarToNextPow2(1);
809 
810   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
811     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812     .customFor({{S64, S64}})
813     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
814   if (ST.has16BitInsts())
815     FPToI.legalFor({{S16, S16}});
816   else
817     FPToI.minScalar(1, S32);
818 
819   FPToI.minScalar(0, S32)
820        .scalarize(0)
821        .lower();
822 
823   // Lower roundeven into G_FRINT
824   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
825     .scalarize(0)
826     .lower();
827 
828   if (ST.has16BitInsts()) {
829     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830       .legalFor({S16, S32, S64})
831       .clampScalar(0, S16, S64)
832       .scalarize(0);
833   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
834     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
835       .legalFor({S32, S64})
836       .clampScalar(0, S32, S64)
837       .scalarize(0);
838   } else {
839     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
840       .legalFor({S32})
841       .customFor({S64})
842       .clampScalar(0, S32, S64)
843       .scalarize(0);
844   }
845 
846   getActionDefinitionsBuilder(G_PTR_ADD)
847     .legalIf(all(isPointer(0), sameSize(0, 1)))
848     .scalarize(0)
849     .scalarSameSizeAs(1, 0);
850 
851   getActionDefinitionsBuilder(G_PTRMASK)
852     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
853     .scalarSameSizeAs(1, 0)
854     .scalarize(0);
855 
856   auto &CmpBuilder =
857     getActionDefinitionsBuilder(G_ICMP)
858     // The compare output type differs based on the register bank of the output,
859     // so make both s1 and s32 legal.
860     //
861     // Scalar compares producing output in scc will be promoted to s32, as that
862     // is the allocatable register type that will be needed for the copy from
863     // scc. This will be promoted during RegBankSelect, and we assume something
864     // before that won't try to use s32 result types.
865     //
866     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
867     // bank.
868     .legalForCartesianProduct(
869       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870     .legalForCartesianProduct(
871       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
872   if (ST.has16BitInsts()) {
873     CmpBuilder.legalFor({{S1, S16}});
874   }
875 
876   CmpBuilder
877     .widenScalarToNextPow2(1)
878     .clampScalar(1, S32, S64)
879     .scalarize(0)
880     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
881 
882   getActionDefinitionsBuilder(G_FCMP)
883     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
884     .widenScalarToNextPow2(1)
885     .clampScalar(1, S32, S64)
886     .scalarize(0);
887 
888   // FIXME: fpow has a selection pattern that should move to custom lowering.
889   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
890   if (ST.has16BitInsts())
891     Exp2Ops.legalFor({S32, S16});
892   else
893     Exp2Ops.legalFor({S32});
894   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
895   Exp2Ops.scalarize(0);
896 
897   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
898   if (ST.has16BitInsts())
899     ExpOps.customFor({{S32}, {S16}});
900   else
901     ExpOps.customFor({S32});
902   ExpOps.clampScalar(0, MinScalarFPTy, S32)
903         .scalarize(0);
904 
905   getActionDefinitionsBuilder(G_FPOWI)
906     .clampScalar(0, MinScalarFPTy, S32)
907     .lower();
908 
909   // The 64-bit versions produce 32-bit results, but only on the SALU.
910   getActionDefinitionsBuilder(G_CTPOP)
911     .legalFor({{S32, S32}, {S32, S64}})
912     .clampScalar(0, S32, S32)
913     .clampScalar(1, S32, S64)
914     .scalarize(0)
915     .widenScalarToNextPow2(0, 32)
916     .widenScalarToNextPow2(1, 32);
917 
918   // The hardware instructions return a different result on 0 than the generic
919   // instructions expect. The hardware produces -1, but these produce the
920   // bitwidth.
921   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
922     .scalarize(0)
923     .clampScalar(0, S32, S32)
924     .clampScalar(1, S32, S64)
925     .widenScalarToNextPow2(0, 32)
926     .widenScalarToNextPow2(1, 32)
927     .lower();
928 
929   // The 64-bit versions produce 32-bit results, but only on the SALU.
930   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
931     .legalFor({{S32, S32}, {S32, S64}})
932     .clampScalar(0, S32, S32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32)
936     .widenScalarToNextPow2(1, 32);
937 
938   getActionDefinitionsBuilder(G_BITREVERSE)
939     .legalFor({S32})
940     .clampScalar(0, S32, S32)
941     .scalarize(0);
942 
943   if (ST.has16BitInsts()) {
944     getActionDefinitionsBuilder(G_BSWAP)
945       .legalFor({S16, S32, V2S16})
946       .clampMaxNumElements(0, S16, 2)
947       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
948       // narrowScalar limitation.
949       .widenScalarToNextPow2(0)
950       .clampScalar(0, S16, S32)
951       .scalarize(0);
952 
953     if (ST.hasVOP3PInsts()) {
954       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
955         .legalFor({S32, S16, V2S16})
956         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
957         .clampMaxNumElements(0, S16, 2)
958         .minScalar(0, S16)
959         .widenScalarToNextPow2(0)
960         .scalarize(0)
961         .lower();
962     } else {
963       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
964         .legalFor({S32, S16})
965         .widenScalarToNextPow2(0)
966         .minScalar(0, S16)
967         .scalarize(0)
968         .lower();
969     }
970   } else {
971     // TODO: Should have same legality without v_perm_b32
972     getActionDefinitionsBuilder(G_BSWAP)
973       .legalFor({S32})
974       .lowerIf(scalarNarrowerThan(0, 32))
975       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
976       // narrowScalar limitation.
977       .widenScalarToNextPow2(0)
978       .maxScalar(0, S32)
979       .scalarize(0)
980       .lower();
981 
982     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
983       .legalFor({S32})
984       .minScalar(0, S32)
985       .widenScalarToNextPow2(0)
986       .scalarize(0)
987       .lower();
988   }
989 
990   getActionDefinitionsBuilder(G_INTTOPTR)
991     // List the common cases
992     .legalForCartesianProduct(AddrSpaces64, {S64})
993     .legalForCartesianProduct(AddrSpaces32, {S32})
994     .scalarize(0)
995     // Accept any address space as long as the size matches
996     .legalIf(sameSize(0, 1))
997     .widenScalarIf(smallerThan(1, 0),
998       [](const LegalityQuery &Query) {
999         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1000       })
1001     .narrowScalarIf(largerThan(1, 0),
1002       [](const LegalityQuery &Query) {
1003         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004       });
1005 
1006   getActionDefinitionsBuilder(G_PTRTOINT)
1007     // List the common cases
1008     .legalForCartesianProduct(AddrSpaces64, {S64})
1009     .legalForCartesianProduct(AddrSpaces32, {S32})
1010     .scalarize(0)
1011     // Accept any address space as long as the size matches
1012     .legalIf(sameSize(0, 1))
1013     .widenScalarIf(smallerThan(0, 1),
1014       [](const LegalityQuery &Query) {
1015         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1016       })
1017     .narrowScalarIf(
1018       largerThan(0, 1),
1019       [](const LegalityQuery &Query) {
1020         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021       });
1022 
1023   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1024     .scalarize(0)
1025     .custom();
1026 
1027   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1028                                     bool IsLoad) -> bool {
1029     const LLT DstTy = Query.Types[0];
1030 
1031     // Split vector extloads.
1032     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1033     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1034 
1035     if (MemSize < DstTy.getSizeInBits())
1036       MemSize = std::max(MemSize, AlignBits);
1037 
1038     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1039       return true;
1040 
1041     const LLT PtrTy = Query.Types[1];
1042     unsigned AS = PtrTy.getAddressSpace();
1043     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1044       return true;
1045 
1046     // Catch weird sized loads that don't evenly divide into the access sizes
1047     // TODO: May be able to widen depending on alignment etc.
1048     unsigned NumRegs = (MemSize + 31) / 32;
1049     if (NumRegs == 3) {
1050       if (!ST.hasDwordx3LoadStores())
1051         return true;
1052     } else {
1053       // If the alignment allows, these should have been widened.
1054       if (!isPowerOf2_32(NumRegs))
1055         return true;
1056     }
1057 
1058     if (AlignBits < MemSize) {
1059       const SITargetLowering *TLI = ST.getTargetLowering();
1060       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1061                                                       Align(AlignBits / 8));
1062     }
1063 
1064     return false;
1065   };
1066 
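  // Minimum alignments (in bits) required by the explicit rules below for
  // global/constant address space accesses. A value of 0 (used when the
  // subtarget has unaligned buffer access enabled) imposes no alignment
  // requirement.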
1067   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1068   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1069   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1070 
1071   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1072   // LDS
1073   // TODO: Unsupported flat for SI.
1074 
1075   for (unsigned Op : {G_LOAD, G_STORE}) {
1076     const bool IsStore = Op == G_STORE;
1077 
1078     auto &Actions = getActionDefinitionsBuilder(Op);
1079     // Explicitly list some common cases.
1080     // TODO: Does this help compile time at all?
1081     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1082                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1083                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1084                                       {S64, GlobalPtr, 64, GlobalAlign32},
1085                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1086                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1087                                       {S32, GlobalPtr, 8, GlobalAlign8},
1088                                       {S32, GlobalPtr, 16, GlobalAlign16},
1089 
1090                                       {S32, LocalPtr, 32, 32},
1091                                       {S64, LocalPtr, 64, 32},
1092                                       {V2S32, LocalPtr, 64, 32},
1093                                       {S32, LocalPtr, 8, 8},
1094                                       {S32, LocalPtr, 16, 16},
1095                                       {V2S16, LocalPtr, 32, 32},
1096 
1097                                       {S32, PrivatePtr, 32, 32},
1098                                       {S32, PrivatePtr, 8, 8},
1099                                       {S32, PrivatePtr, 16, 16},
1100                                       {V2S16, PrivatePtr, 32, 32},
1101 
1102                                       {S32, ConstantPtr, 32, GlobalAlign32},
1103                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1104                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1105                                       {S64, ConstantPtr, 64, GlobalAlign32},
1106                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1107     Actions.legalIf(
1108       [=](const LegalityQuery &Query) -> bool {
1109         return isLoadStoreLegal(ST, Query, Op);
1110       });
1111 
1112     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1113     // 64-bits.
1114     //
1115     // TODO: Should generalize bitcast action into coerce, which will also cover
1116     // inserting addrspacecasts.
1117     Actions.customIf(typeIs(1, Constant32Ptr));
1118 
1119     // Turn vectors with illegal element types into something easier to deal
1120     // with. These will ultimately produce 32-bit scalar shifts to extract the
1121     // parts anyway.
1122     //
1123     // For odd 16-bit element vectors, prefer to split those into pieces with
1124     // 16-bit vector parts.
1125     Actions.bitcastIf(
1126       [=](const LegalityQuery &Query) -> bool {
1127         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1128                                           Query.MMODescrs[0].SizeInBits);
1129       }, bitcastToRegisterType(0));
1130 
1131     if (!IsStore) {
1132       // Widen suitably aligned loads by loading extra bytes. The standard
1133       // legalization actions can't properly express widening memory operands.
1134       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1135         return shouldWidenLoad(ST, Query, G_LOAD);
1136       });
1137     }
1138 
1139     // FIXME: load/store narrowing should be moved to lower action
1140     Actions
1141         .narrowScalarIf(
1142             [=](const LegalityQuery &Query) -> bool {
1143               return !Query.Types[0].isVector() &&
1144                      needToSplitMemOp(Query, Op == G_LOAD);
1145             },
1146             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1147               const LLT DstTy = Query.Types[0];
1148               const LLT PtrTy = Query.Types[1];
1149 
1150               const unsigned DstSize = DstTy.getSizeInBits();
1151               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1152 
1153               // Split extloads.
1154               if (DstSize > MemSize)
1155                 return std::make_pair(0, LLT::scalar(MemSize));
1156 
1157               if (!isPowerOf2_32(DstSize)) {
1158                 // We're probably decomposing an odd sized store. Try to split
1159                 // to the widest type. TODO: Account for alignment. As-is it
1160                 // should be OK, since the new parts will be further legalized.
1161                 unsigned FloorSize = PowerOf2Floor(DstSize);
1162                 return std::make_pair(0, LLT::scalar(FloorSize));
1163               }
1164 
1165               if (DstSize > 32 && (DstSize % 32 != 0)) {
1166                 // FIXME: Need a way to specify non-extload of larger size if
1167                 // suitably aligned.
1168                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1169               }
1170 
1171               unsigned MaxSize = maxSizeForAddrSpace(ST,
1172                                                      PtrTy.getAddressSpace(),
1173                                                      Op == G_LOAD);
1174               if (MemSize > MaxSize)
1175                 return std::make_pair(0, LLT::scalar(MaxSize));
1176 
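              // Need to split because of alignment; narrow to the known
              // alignment width (in bits).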
1177               unsigned Align = Query.MMODescrs[0].AlignInBits;
1178               return std::make_pair(0, LLT::scalar(Align));
1179             })
1180         .fewerElementsIf(
1181             [=](const LegalityQuery &Query) -> bool {
1182               return Query.Types[0].isVector() &&
1183                      needToSplitMemOp(Query, Op == G_LOAD);
1184             },
1185             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1186               const LLT DstTy = Query.Types[0];
1187               const LLT PtrTy = Query.Types[1];
1188 
1189               LLT EltTy = DstTy.getElementType();
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193 
1194               // FIXME: Handle widened to power of 2 results better. This ends
1195               // up scalarizing.
1196               // FIXME: 3 element stores scalarized on SI
1197 
1198               // Split if it's too large for the address space.
1199               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1200                 unsigned NumElts = DstTy.getNumElements();
1201                 unsigned EltSize = EltTy.getSizeInBits();
1202 
1203                 if (MaxSize % EltSize == 0) {
1204                   return std::make_pair(
1205                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(0,
1217                                       LLT::vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1222               if (DstTy.getSizeInBits() > MemSize)
1223                 return std::make_pair(0, EltTy);
1224 
1225               unsigned EltSize = EltTy.getSizeInBits();
1226               unsigned DstSize = DstTy.getSizeInBits();
1227               if (!isPowerOf2_32(DstSize)) {
1228                 // We're probably decomposing an odd sized store. Try to split
1229                 // to the widest type. TODO: Account for alignment. As-is it
1230                 // should be OK, since the new parts will be further legalized.
1231                 unsigned FloorSize = PowerOf2Floor(DstSize);
1232                 return std::make_pair(
1233                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1234               }
1235 
1236               // Need to split because of alignment.
1237               unsigned Align = Query.MMODescrs[0].AlignInBits;
1238               if (EltSize > Align &&
1239                   (EltSize / Align < DstTy.getNumElements())) {
1240                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1241               }
1242 
1243               // May need relegalization for the scalars.
1244               return std::make_pair(0, EltTy);
1245             })
1246     .lowerIfMemSizeNotPow2()
1247     .minScalar(0, S32);
1248 
1249     if (IsStore)
1250       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1251 
1252     Actions
1253         .widenScalarToNextPow2(0)
1254         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1255         .lower();
1256   }
1257 
1258   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1259                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1260                                                   {S32, GlobalPtr, 16, 2 * 8},
1261                                                   {S32, LocalPtr, 8, 8},
1262                                                   {S32, LocalPtr, 16, 16},
1263                                                   {S32, PrivatePtr, 8, 8},
1264                                                   {S32, PrivatePtr, 16, 16},
1265                                                   {S32, ConstantPtr, 8, 8},
1266                                                   {S32, ConstantPtr, 16, 2 * 8}});
1267   if (ST.hasFlatAddressSpace()) {
1268     ExtLoads.legalForTypesWithMemDesc(
1269         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1270   }
1271 
1272   ExtLoads.clampScalar(0, S32, S32)
1273           .widenScalarToNextPow2(0)
1274           .unsupportedIfMemSizeNotPow2()
1275           .lower();
1276 
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   if (ST.hasLDSFPAtomics()) {
1290     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1291       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292   }
1293 
1294   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1295   // demarshalling.
1296   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1297     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1298                 {S32, FlatPtr}, {S64, FlatPtr}})
1299     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1300                {S32, RegionPtr}, {S64, RegionPtr}});
1301   // TODO: Pointer types, any 32-bit or 64-bit vector
1302 
1303   // Condition should be s32 for scalar, s1 for vector.
1304   getActionDefinitionsBuilder(G_SELECT)
1305     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1306           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1307           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1308     .clampScalar(0, S16, S64)
1309     .scalarize(1)
1310     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1311     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1312     .clampMaxNumElements(0, S32, 2)
1313     .clampMaxNumElements(0, LocalPtr, 2)
1314     .clampMaxNumElements(0, PrivatePtr, 2)
1315     .scalarize(0)
1316     .widenScalarToNextPow2(0)
1317     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1318 
1319   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1320   // be more flexible with the shift amount type.
1321   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1322     .legalFor({{S32, S32}, {S64, S32}});
1323   if (ST.has16BitInsts()) {
1324     if (ST.hasVOP3PInsts()) {
1325       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1326             .clampMaxNumElements(0, S16, 2);
1327     } else
1328       Shifts.legalFor({{S16, S16}});
1329 
1330     // TODO: Support 16-bit shift amounts for all types
1331     Shifts.widenScalarIf(
1332       [=](const LegalityQuery &Query) {
1333         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1334         // 32-bit amount.
1335         const LLT ValTy = Query.Types[0];
1336         const LLT AmountTy = Query.Types[1];
1337         return ValTy.getSizeInBits() <= 16 &&
1338                AmountTy.getSizeInBits() < 16;
1339       }, changeTo(1, S16));
1340     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1341     Shifts.clampScalar(1, S32, S32);
1342     Shifts.clampScalar(0, S16, S64);
1343     Shifts.widenScalarToNextPow2(0, 16);
1344 
1345     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1346       .minScalar(0, S16)
1347       .scalarize(0)
1348       .lower();
1349   } else {
1350     // Make sure we legalize the shift amount type first, as the general
1351     // expansion for the shifted type will produce much worse code if it hasn't
1352     // been truncated already.
1353     Shifts.clampScalar(1, S32, S32);
1354     Shifts.clampScalar(0, S32, S64);
1355     Shifts.widenScalarToNextPow2(0, 32);
1356 
1357     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1358       .minScalar(0, S32)
1359       .scalarize(0)
1360       .lower();
1361   }
1362   Shifts.scalarize(0);
1363 
1364   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1365     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1366     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1367     unsigned IdxTypeIdx = 2;
1368 
1369     getActionDefinitionsBuilder(Op)
1370       .customIf([=](const LegalityQuery &Query) {
1371           const LLT EltTy = Query.Types[EltTypeIdx];
1372           const LLT VecTy = Query.Types[VecTypeIdx];
1373           const LLT IdxTy = Query.Types[IdxTypeIdx];
1374           const unsigned EltSize = EltTy.getSizeInBits();
1375           return (EltSize == 32 || EltSize == 64) &&
1376                   VecTy.getSizeInBits() % 32 == 0 &&
1377                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1378                   IdxTy.getSizeInBits() == 32;
1379         })
1380       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1381                  bitcastToVectorElement32(VecTypeIdx))
1382       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1383       .bitcastIf(
1384         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1385         [=](const LegalityQuery &Query) {
1386           // For > 64-bit element types, try to turn this into a 64-bit
1387           // element vector since we may be able to do better indexing
1388           // if this is scalar. If not, fall back to 32.
1389           const LLT EltTy = Query.Types[EltTypeIdx];
1390           const LLT VecTy = Query.Types[VecTypeIdx];
1391           const unsigned DstEltSize = EltTy.getSizeInBits();
1392           const unsigned VecSize = VecTy.getSizeInBits();
1393 
1394           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1395           return std::make_pair(
1396             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1397         })
1398       .clampScalar(EltTypeIdx, S32, S64)
1399       .clampScalar(VecTypeIdx, S32, S64)
1400       .clampScalar(IdxTypeIdx, S32, S32)
1401       .clampMaxNumElements(VecTypeIdx, S32, 32)
1402       // TODO: Clamp elements for 64-bit vectors?
1403       // It should only be necessary with variable indexes.
1404       // As a last resort, lower to the stack
1405       .lower();
1406   }
1407 
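  // Reject any G_EXTRACT_VECTOR_ELT whose result type does not match the
  // vector's element type.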
1408   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1409     .unsupportedIf([=](const LegalityQuery &Query) {
1410         const LLT &EltTy = Query.Types[1].getElementType();
1411         return Query.Types[0] != EltTy;
1412       });
1413 
1414   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1415     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1416     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1417 
1418     // FIXME: Doesn't handle extract of illegal sizes.
1419     getActionDefinitionsBuilder(Op)
1420       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1421       // FIXME: Multiples of 16 should not be legal.
1422       .legalIf([=](const LegalityQuery &Query) {
1423           const LLT BigTy = Query.Types[BigTyIdx];
1424           const LLT LitTy = Query.Types[LitTyIdx];
1425           return (BigTy.getSizeInBits() % 32 == 0) &&
1426                  (LitTy.getSizeInBits() % 16 == 0);
1427         })
1428       .widenScalarIf(
1429         [=](const LegalityQuery &Query) {
1430           const LLT BigTy = Query.Types[BigTyIdx];
1431           return (BigTy.getScalarSizeInBits() < 16);
1432         },
1433         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1434       .widenScalarIf(
1435         [=](const LegalityQuery &Query) {
1436           const LLT LitTy = Query.Types[LitTyIdx];
1437           return (LitTy.getScalarSizeInBits() < 16);
1438         },
1439         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1440       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1441       .widenScalarToNextPow2(BigTyIdx, 32);
1442 
1443   }
1444 
1445   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1446     .legalForCartesianProduct(AllS32Vectors, {S32})
1447     .legalForCartesianProduct(AllS64Vectors, {S64})
1448     .clampNumElements(0, V16S32, V32S32)
1449     .clampNumElements(0, V2S64, V16S64)
1450     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1451 
1452   if (ST.hasScalarPackInsts()) {
1453     BuildVector
1454       // FIXME: Should probably widen s1 vectors straight to s32
1455       .minScalarOrElt(0, S16)
1456       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1457       .minScalar(1, S32);
1458 
1459     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460       .legalFor({V2S16, S32})
1461       .lower();
1462     BuildVector.minScalarOrElt(0, S32);
1463   } else {
1464     BuildVector.customFor({V2S16, S16});
1465     BuildVector.minScalarOrElt(0, S32);
1466 
1467     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1468       .customFor({V2S16, S32})
1469       .lower();
1470   }
1471 
1472   BuildVector.legalIf(isRegisterType(0));
1473 
1474   // FIXME: Clamp maximum size
1475   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1476     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1477     .clampMaxNumElements(0, S32, 32)
1478     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1479     .clampMaxNumElements(0, S16, 64);
1480 
1481   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1482   // pre-legalize.
1483   if (ST.hasVOP3PInsts()) {
1484     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1485       .customFor({V2S16, V2S16})
1486       .lower();
1487   } else
1488     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1489 
1490   // Merge/Unmerge
1491   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1492     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1493     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1494 
1495     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1496       const LLT Ty = Query.Types[TypeIdx];
1497       if (Ty.isVector()) {
1498         const LLT &EltTy = Ty.getElementType();
1499         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1500           return true;
1501         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1502           return true;
1503       }
1504       return false;
1505     };
1506 
1507     auto &Builder = getActionDefinitionsBuilder(Op)
1508       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1509       .lowerFor({{S16, V2S16}})
1510       .lowerIf([=](const LegalityQuery &Query) {
1511           const LLT BigTy = Query.Types[BigTyIdx];
1512           return BigTy.getSizeInBits() == 32;
1513         })
1514       // Try to widen to s16 first for small types.
1515       // TODO: Only do this on targets with legal s16 shifts
1516       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1517       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1518       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1519       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1520                            elementTypeIs(1, S16)),
1521                        changeTo(1, V2S16))
1522       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1523       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1524       // valid.
1525       .clampScalar(LitTyIdx, S32, S512)
1526       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1527       // Break up vectors with weird elements into scalars
1528       .fewerElementsIf(
1529         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1530         scalarize(0))
1531       .fewerElementsIf(
1532         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1533         scalarize(1))
1534       .clampScalar(BigTyIdx, S32, MaxScalar);
1535 
1536     if (Op == G_MERGE_VALUES) {
1537       Builder.widenScalarIf(
1538         // TODO: Use 16-bit shifts if legal for 8-bit values?
1539         [=](const LegalityQuery &Query) {
1540           const LLT Ty = Query.Types[LitTyIdx];
1541           return Ty.getSizeInBits() < 32;
1542         },
1543         changeTo(LitTyIdx, S32));
1544     }
1545 
1546     Builder.widenScalarIf(
1547       [=](const LegalityQuery &Query) {
1548         const LLT Ty = Query.Types[BigTyIdx];
1549         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1550           Ty.getSizeInBits() % 16 != 0;
1551       },
1552       [=](const LegalityQuery &Query) {
1553         // Pick the next power of 2, or a multiple of 64 over 128, whichever
1554         // is smaller.
1555         const LLT &Ty = Query.Types[BigTyIdx];
1556         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1557         if (NewSizeInBits >= 256) {
1558           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1559           if (RoundedTo < NewSizeInBits)
1560             NewSizeInBits = RoundedTo;
1561         }
1562         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1563       })
1564       // Any vectors left are the wrong size. Scalarize them.
1565       .scalarize(0)
1566       .scalarize(1);
1567   }
1568 
1569   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1570   // RegBankSelect.
1571   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1572     .legalFor({{S32}, {S64}});
1573 
1574   if (ST.hasVOP3PInsts()) {
1575     SextInReg.lowerFor({{V2S16}})
1576       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1577       // get more vector shift opportunities, since we'll get those when
1578       // expanded.
1579       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1580   } else if (ST.has16BitInsts()) {
1581     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1582   } else {
1583     // Prefer to promote to s32 before lowering if we don't have 16-bit
1584     // shifts. This avoids a lot of intermediate truncate and extend operations.
1585     SextInReg.lowerFor({{S32}, {S64}});
1586   }
1587 
1588   SextInReg
1589     .scalarize(0)
1590     .clampScalar(0, S32, S64)
1591     .lower();
1592 
1593   getActionDefinitionsBuilder(G_FSHR)
1594     .legalFor({{S32, S32}})
1595     .scalarize(0)
1596     .lower();
1597 
1598   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1599     .legalFor({S64});
1600 
1601   getActionDefinitionsBuilder(G_FENCE)
1602     .alwaysLegal();
1603 
1604   getActionDefinitionsBuilder({
1605       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1606       G_FCOPYSIGN,
1607 
1608       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1609       G_ATOMICRMW_NAND,
1610       G_ATOMICRMW_FSUB,
1611       G_READ_REGISTER,
1612       G_WRITE_REGISTER,
1613 
1614       G_SADDO, G_SSUBO,
1615 
1616       // TODO: Implement
1617       G_FMINIMUM, G_FMAXIMUM,
1618       G_FSHL
1619     }).lower();
1620 
1621   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1622         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1623         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1624     .unsupported();
1625 
1626   computeTables();
1627   verify(*ST.getInstrInfo());
1628 }
1629 
1630 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1631                                          MachineInstr &MI) const {
1632   MachineIRBuilder &B = Helper.MIRBuilder;
1633   MachineRegisterInfo &MRI = *B.getMRI();
1634 
1635   switch (MI.getOpcode()) {
1636   case TargetOpcode::G_ADDRSPACE_CAST:
1637     return legalizeAddrSpaceCast(MI, MRI, B);
1638   case TargetOpcode::G_FRINT:
1639     return legalizeFrint(MI, MRI, B);
1640   case TargetOpcode::G_FCEIL:
1641     return legalizeFceil(MI, MRI, B);
1642   case TargetOpcode::G_FREM:
1643     return legalizeFrem(MI, MRI, B);
1644   case TargetOpcode::G_INTRINSIC_TRUNC:
1645     return legalizeIntrinsicTrunc(MI, MRI, B);
1646   case TargetOpcode::G_SITOFP:
1647     return legalizeITOFP(MI, MRI, B, true);
1648   case TargetOpcode::G_UITOFP:
1649     return legalizeITOFP(MI, MRI, B, false);
1650   case TargetOpcode::G_FPTOSI:
1651     return legalizeFPTOI(MI, MRI, B, true);
1652   case TargetOpcode::G_FPTOUI:
1653     return legalizeFPTOI(MI, MRI, B, false);
1654   case TargetOpcode::G_FMINNUM:
1655   case TargetOpcode::G_FMAXNUM:
1656   case TargetOpcode::G_FMINNUM_IEEE:
1657   case TargetOpcode::G_FMAXNUM_IEEE:
1658     return legalizeMinNumMaxNum(Helper, MI);
1659   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1660     return legalizeExtractVectorElt(MI, MRI, B);
1661   case TargetOpcode::G_INSERT_VECTOR_ELT:
1662     return legalizeInsertVectorElt(MI, MRI, B);
1663   case TargetOpcode::G_SHUFFLE_VECTOR:
1664     return legalizeShuffleVector(MI, MRI, B);
1665   case TargetOpcode::G_FSIN:
1666   case TargetOpcode::G_FCOS:
1667     return legalizeSinCos(MI, MRI, B);
1668   case TargetOpcode::G_GLOBAL_VALUE:
1669     return legalizeGlobalValue(MI, MRI, B);
1670   case TargetOpcode::G_LOAD:
1671     return legalizeLoad(Helper, MI);
1672   case TargetOpcode::G_FMAD:
1673     return legalizeFMad(MI, MRI, B);
1674   case TargetOpcode::G_FDIV:
1675     return legalizeFDIV(MI, MRI, B);
1676   case TargetOpcode::G_UDIV:
1677   case TargetOpcode::G_UREM:
1678     return legalizeUDIV_UREM(MI, MRI, B);
1679   case TargetOpcode::G_SDIV:
1680   case TargetOpcode::G_SREM:
1681     return legalizeSDIV_SREM(MI, MRI, B);
1682   case TargetOpcode::G_ATOMIC_CMPXCHG:
1683     return legalizeAtomicCmpXChg(MI, MRI, B);
1684   case TargetOpcode::G_FLOG:
1685     return legalizeFlog(MI, B, numbers::ln2f);
1686   case TargetOpcode::G_FLOG10:
1687     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1688   case TargetOpcode::G_FEXP:
1689     return legalizeFExp(MI, B);
1690   case TargetOpcode::G_FPOW:
1691     return legalizeFPow(MI, B);
1692   case TargetOpcode::G_FFLOOR:
1693     return legalizeFFloor(MI, MRI, B);
1694   case TargetOpcode::G_BUILD_VECTOR:
1695     return legalizeBuildVector(MI, MRI, B);
1696   default:
1697     return false;
1698   }
1699 
1700   llvm_unreachable("expected switch to return");
1701 }
1702 
1703 Register AMDGPULegalizerInfo::getSegmentAperture(
1704   unsigned AS,
1705   MachineRegisterInfo &MRI,
1706   MachineIRBuilder &B) const {
1707   MachineFunction &MF = B.getMF();
1708   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1709   const LLT S32 = LLT::scalar(32);
1710 
1711   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1712 
1713   if (ST.hasApertureRegs()) {
1714     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1715     // getreg.
1716     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1717         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1718         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1719     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1720         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1721         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1722     unsigned Encoding =
1723         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1724         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1725         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1726 
1727     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1728 
1729     B.buildInstr(AMDGPU::S_GETREG_B32)
1730       .addDef(GetReg)
1731       .addImm(Encoding);
1732     MRI.setType(GetReg, S32);
1733 
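    // The aperture field read back by s_getreg is (WidthM1 + 1) bits wide;
    // shifting it left by the field width reconstructs the high 32 bits of the
    // 64-bit aperture base, which legalizeAddrSpaceCast uses as the high half
    // of the flat pointer.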
1734     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1735     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1736   }
1737 
1738   Register QueuePtr = MRI.createGenericVirtualRegister(
1739     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1740 
1741   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1742     return Register();
1743 
1744   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1745   // private_segment_aperture_base_hi.
1746   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1747 
1748   // TODO: can we be smarter about machine pointer info?
1749   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1750   MachineMemOperand *MMO = MF.getMachineMemOperand(
1751       PtrInfo,
1752       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1753           MachineMemOperand::MOInvariant,
1754       4, commonAlignment(Align(64), StructOffset));
1755 
1756   Register LoadAddr;
1757 
1758   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1759   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1760 }
1761 
1762 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1763   MachineInstr &MI, MachineRegisterInfo &MRI,
1764   MachineIRBuilder &B) const {
1765   MachineFunction &MF = B.getMF();
1766 
1767   const LLT S32 = LLT::scalar(32);
1768   Register Dst = MI.getOperand(0).getReg();
1769   Register Src = MI.getOperand(1).getReg();
1770 
1771   LLT DstTy = MRI.getType(Dst);
1772   LLT SrcTy = MRI.getType(Src);
1773   unsigned DestAS = DstTy.getAddressSpace();
1774   unsigned SrcAS = SrcTy.getAddressSpace();
1775 
1776   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1777   // vector element.
1778   assert(!DstTy.isVector());
1779 
1780   const AMDGPUTargetMachine &TM
1781     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1782 
1783   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1784     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1785     return true;
1786   }
1787 
1788   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1789     // Truncate.
1790     B.buildExtract(Dst, Src, 0);
1791     MI.eraseFromParent();
1792     return true;
1793   }
1794 
1795   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1796     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1797     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1798 
1799     // FIXME: This is a bit ugly: we merge two 32-bit pointers into a pointer of
1800     // a different type. Merge operands are required to be the same type, but
1801     // creating an extra ptrtoint would be kind of pointless.
1802     auto HighAddr = B.buildConstant(
1803       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1804     B.buildMerge(Dst, {Src, HighAddr});
1805     MI.eraseFromParent();
1806     return true;
1807   }
1808 
1809   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1810     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1811            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1812     unsigned NullVal = TM.getNullPointerValue(DestAS);
1813 
1814     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1815     auto FlatNull = B.buildConstant(SrcTy, 0);
1816 
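    // A null flat pointer must map to the segment null value; any other
    // pointer simply becomes its low 32 bits.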
1817     // Extract low 32-bits of the pointer.
1818     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1819 
1820     auto CmpRes =
1821         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1822     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1823 
1824     MI.eraseFromParent();
1825     return true;
1826   }
1827 
1828   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1829     return false;
1830 
1831   if (!ST.hasFlatAddressSpace())
1832     return false;
1833 
1834   auto SegmentNull =
1835       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1836   auto FlatNull =
1837       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1838 
1839   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1840   if (!ApertureReg.isValid())
1841     return false;
1842 
1843   auto CmpRes =
1844       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1845 
1846   // Coerce the type of the low half of the result so we can use merge_values.
1847   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1848 
1849   // TODO: Should we allow mismatched types but matching sizes in merges to
1850   // avoid the ptrtoint?
1851   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1852   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1853 
1854   MI.eraseFromParent();
1855   return true;
1856 }
1857 
1858 bool AMDGPULegalizerInfo::legalizeFrint(
1859   MachineInstr &MI, MachineRegisterInfo &MRI,
1860   MachineIRBuilder &B) const {
1861   Register Src = MI.getOperand(1).getReg();
1862   LLT Ty = MRI.getType(Src);
1863   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1864 
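  // Round to nearest by adding and subtracting copysign(2^52, Src): once a
  // double reaches magnitude 2^52 its ulp is 1.0, so the add/sub pair discards
  // the fractional bits. C2 (0x1.fffffffffffffp+51) is the largest magnitude
  // that still has fraction bits; anything bigger is already integral and the
  // final select returns it unchanged.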
1865   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1866   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1867 
1868   auto C1 = B.buildFConstant(Ty, C1Val);
1869   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1870 
1871   // TODO: Should this propagate fast-math-flags?
1872   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1873   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1874 
1875   auto C2 = B.buildFConstant(Ty, C2Val);
1876   auto Fabs = B.buildFAbs(Ty, Src);
1877 
1878   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1879   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1880   MI.eraseFromParent();
1881   return true;
1882 }
1883 
1884 bool AMDGPULegalizerInfo::legalizeFceil(
1885   MachineInstr &MI, MachineRegisterInfo &MRI,
1886   MachineIRBuilder &B) const {
1887 
1888   const LLT S1 = LLT::scalar(1);
1889   const LLT S64 = LLT::scalar(64);
1890 
1891   Register Src = MI.getOperand(1).getReg();
1892   assert(MRI.getType(Src) == S64);
1893 
1894   // result = trunc(src)
1895   // if (src > 0.0 && src != result)
1896   //   result += 1.0
1897 
1898   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1899 
1900   const auto Zero = B.buildFConstant(S64, 0.0);
1901   const auto One = B.buildFConstant(S64, 1.0);
1902   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1903   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1904   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1905   auto Add = B.buildSelect(S64, And, One, Zero);
1906 
1907   // TODO: Should this propagate fast-math-flags?
1908   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1909   MI.eraseFromParent();
  return true;
1910 }
1911 
1912 bool AMDGPULegalizerInfo::legalizeFrem(
1913   MachineInstr &MI, MachineRegisterInfo &MRI,
1914   MachineIRBuilder &B) const {
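  // frem(x, y) = x - trunc(x / y) * y, with the final multiply-subtract done
  // as a single FMA.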
1915   Register DstReg = MI.getOperand(0).getReg();
1916   Register Src0Reg = MI.getOperand(1).getReg();
1917   Register Src1Reg = MI.getOperand(2).getReg();
1918   auto Flags = MI.getFlags();
1919   LLT Ty = MRI.getType(DstReg);
1920 
1921   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1922   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1923   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1924   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1925   MI.eraseFromParent();
1926   return true;
1927 }
1928 
1929 static MachineInstrBuilder extractF64Exponent(Register Hi,
1930                                               MachineIRBuilder &B) {
1931   const unsigned FractBits = 52;
1932   const unsigned ExpBits = 11;
1933   LLT S32 = LLT::scalar(32);
1934 
1935   auto Const0 = B.buildConstant(S32, FractBits - 32);
1936   auto Const1 = B.buildConstant(S32, ExpBits);
1937 
1938   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1939     .addUse(Hi)
1940     .addUse(Const0.getReg(0))
1941     .addUse(Const1.getReg(0));
1942 
1943   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1944 }
1945 
1946 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1947   MachineInstr &MI, MachineRegisterInfo &MRI,
1948   MachineIRBuilder &B) const {
1949   const LLT S1 = LLT::scalar(1);
1950   const LLT S32 = LLT::scalar(32);
1951   const LLT S64 = LLT::scalar(64);
1952 
1953   Register Src = MI.getOperand(1).getReg();
1954   assert(MRI.getType(Src) == S64);
1955 
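  // Truncate toward zero by masking off the mantissa bits that sit below the
  // binary point. An exponent below 0 means |Src| < 1, so the result is a
  // signed zero; an exponent above 51 means Src is already integral.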
1956   // TODO: Should this use extract since the low half is unused?
1957   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1958   Register Hi = Unmerge.getReg(1);
1959 
1960   // Extract the upper half, since this is where we will find the sign and
1961   // exponent.
1962   auto Exp = extractF64Exponent(Hi, B);
1963 
1964   const unsigned FractBits = 52;
1965 
1966   // Extract the sign bit.
1967   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1968   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1969 
1970   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1971 
1972   const auto Zero32 = B.buildConstant(S32, 0);
1973 
1974   // Extend back to 64-bits.
1975   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1976 
1977   auto Shr = B.buildAShr(S64, FractMask, Exp);
1978   auto Not = B.buildNot(S64, Shr);
1979   auto Tmp0 = B.buildAnd(S64, Src, Not);
1980   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1981 
1982   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1983   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1984 
1985   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1986   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1987   MI.eraseFromParent();
1988   return true;
1989 }
1990 
1991 bool AMDGPULegalizerInfo::legalizeITOFP(
1992   MachineInstr &MI, MachineRegisterInfo &MRI,
1993   MachineIRBuilder &B, bool Signed) const {
1994 
1995   Register Dst = MI.getOperand(0).getReg();
1996   Register Src = MI.getOperand(1).getReg();
1997 
1998   const LLT S64 = LLT::scalar(64);
1999   const LLT S32 = LLT::scalar(32);
2000 
2001   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2002 
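  // Convert the 64-bit integer in 32-bit halves:
  //   result = ldexp((double)Hi, 32) + (double)(uint32)Lo
  // where only the high half is converted as signed when Signed is set.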
2003   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2004 
2005   auto CvtHi = Signed ?
2006     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2007     B.buildUITOFP(S64, Unmerge.getReg(1));
2008 
2009   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2010 
2011   auto ThirtyTwo = B.buildConstant(S32, 32);
2012   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2013     .addUse(CvtHi.getReg(0))
2014     .addUse(ThirtyTwo.getReg(0));
2015 
2016   // TODO: Should this propagate fast-math-flags?
2017   B.buildFAdd(Dst, LdExp, CvtLo);
2018   MI.eraseFromParent();
2019   return true;
2020 }
2021 
2022 // TODO: Copied from DAG implementation. Verify logic and document how this
2023 // actually works.
2024 bool AMDGPULegalizerInfo::legalizeFPTOI(
2025   MachineInstr &MI, MachineRegisterInfo &MRI,
2026   MachineIRBuilder &B, bool Signed) const {
2027 
2028   Register Dst = MI.getOperand(0).getReg();
2029   Register Src = MI.getOperand(1).getReg();
2030 
2031   const LLT S64 = LLT::scalar(64);
2032   const LLT S32 = LLT::scalar(32);
2033 
2034   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2035 
2036   unsigned Flags = MI.getFlags();
2037 
2038   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2039   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2040   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2041 
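  // K0 is 2^-32 and K1 is -2^32 as f64 bit patterns. The truncated value is
  // split into its 32-bit digits, Hi = floor(Trunc * 2^-32) and
  // Lo = fma(Hi, -2^32, Trunc), which are converted separately and re-merged.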
2042   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2043   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2044   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2045 
2046   auto Hi = Signed ?
2047     B.buildFPTOSI(S32, FloorMul) :
2048     B.buildFPTOUI(S32, FloorMul);
2049   auto Lo = B.buildFPTOUI(S32, Fma);
2050 
2051   B.buildMerge(Dst, { Lo, Hi });
2052   MI.eraseFromParent();
2053 
2054   return true;
2055 }
2056 
2057 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2058                                                MachineInstr &MI) const {
2059   MachineFunction &MF = Helper.MIRBuilder.getMF();
2060   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2061 
2062   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2063                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2064 
2065   // With ieee_mode disabled, the instructions have the correct behavior
2066   // already for G_FMINNUM/G_FMAXNUM
2067   if (!MFI->getMode().IEEE)
2068     return !IsIEEEOp;
2069 
2070   if (IsIEEEOp)
2071     return true;
2072 
2073   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2074 }
2075 
2076 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2077   MachineInstr &MI, MachineRegisterInfo &MRI,
2078   MachineIRBuilder &B) const {
2079   // TODO: Should move some of this into LegalizerHelper.
2080 
2081   // TODO: Promote dynamic indexing of s16 to s32
2082 
2083   // FIXME: Artifact combiner probably should have replaced the truncated
2084   // constant before this, so we shouldn't need
2085   // getConstantVRegValWithLookThrough.
2086   Optional<ValueAndVReg> MaybeIdxVal =
2087       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2088   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2089     return true;
2090   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2091 
2092   Register Dst = MI.getOperand(0).getReg();
2093   Register Vec = MI.getOperand(1).getReg();
2094 
2095   LLT VecTy = MRI.getType(Vec);
2096   LLT EltTy = VecTy.getElementType();
2097   assert(EltTy == MRI.getType(Dst));
2098 
2099   if (IdxVal < VecTy.getNumElements())
2100     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2101   else
2102     B.buildUndef(Dst);
2103 
2104   MI.eraseFromParent();
2105   return true;
2106 }
2107 
2108 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2109   MachineInstr &MI, MachineRegisterInfo &MRI,
2110   MachineIRBuilder &B) const {
2111   // TODO: Should move some of this into LegalizerHelper.
2112 
2113   // TODO: Promote dynamic indexing of s16 to s32
2114 
2115   // FIXME: Artifact combiner probably should have replaced the truncated
2116   // constant before this, so we shouldn't need
2117   // getConstantVRegValWithLookThrough.
2118   Optional<ValueAndVReg> MaybeIdxVal =
2119       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2120   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2121     return true;
2122 
2123   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2124   Register Dst = MI.getOperand(0).getReg();
2125   Register Vec = MI.getOperand(1).getReg();
2126   Register Ins = MI.getOperand(2).getReg();
2127 
2128   LLT VecTy = MRI.getType(Vec);
2129   LLT EltTy = VecTy.getElementType();
2130   assert(EltTy == MRI.getType(Ins));
2131 
2132   if (IdxVal < VecTy.getNumElements())
2133     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2134   else
2135     B.buildUndef(Dst);
2136 
2137   MI.eraseFromParent();
2138   return true;
2139 }
2140 
2141 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2142   MachineInstr &MI, MachineRegisterInfo &MRI,
2143   MachineIRBuilder &B) const {
2144   const LLT V2S16 = LLT::vector(2, 16);
2145 
2146   Register Dst = MI.getOperand(0).getReg();
2147   Register Src0 = MI.getOperand(1).getReg();
2148   LLT DstTy = MRI.getType(Dst);
2149   LLT SrcTy = MRI.getType(Src0);
2150 
2151   if (SrcTy == V2S16 && DstTy == V2S16 &&
2152       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2153     return true;
2154 
2155   MachineIRBuilder HelperBuilder(MI);
2156   GISelObserverWrapper DummyObserver;
2157   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2158   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2159 }
2160 
2161 bool AMDGPULegalizerInfo::legalizeSinCos(
2162   MachineInstr &MI, MachineRegisterInfo &MRI,
2163   MachineIRBuilder &B) const {
2164 
2165   Register DstReg = MI.getOperand(0).getReg();
2166   Register SrcReg = MI.getOperand(1).getReg();
2167   LLT Ty = MRI.getType(DstReg);
2168   unsigned Flags = MI.getFlags();
2169 
2170   Register TrigVal;
2171   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2172   if (ST.hasTrigReducedRange()) {
2173     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2174     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2175       .addUse(MulVal.getReg(0))
2176       .setMIFlags(Flags).getReg(0);
2177   } else
2178     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2179 
2180   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2181     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2182   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2183     .addUse(TrigVal)
2184     .setMIFlags(Flags);
2185   MI.eraseFromParent();
2186   return true;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2190                                                   MachineIRBuilder &B,
2191                                                   const GlobalValue *GV,
2192                                                   int64_t Offset,
2193                                                   unsigned GAFlags) const {
2194   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2195   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2196   // to the following code sequence:
2197   //
2198   // For constant address space:
2199   //   s_getpc_b64 s[0:1]
2200   //   s_add_u32 s0, s0, $symbol
2201   //   s_addc_u32 s1, s1, 0
2202   //
2203   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204   //   a fixup or relocation is emitted to replace $symbol with a literal
2205   //   constant, which is a pc-relative offset from the encoding of the $symbol
2206   //   operand to the global variable.
2207   //
2208   // For global address space:
2209   //   s_getpc_b64 s[0:1]
2210   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2211   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2212   //
2213   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2214   //   fixups or relocations are emitted to replace $symbol@*@lo and
2215   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2216   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2217   //   operand to the global variable.
2218   //
2219   // What we want here is an offset from the value returned by s_getpc
2220   // (which is the address of the s_add_u32 instruction) to the global
2221   // variable, but since the encoding of $symbol starts 4 bytes after the start
2222   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2223   // small. This requires us to add 4 to the global variable offset in order to
2224   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2225   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2226   // instruction.
2227 
2228   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2229 
2230   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2231     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2232 
2233   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2234     .addDef(PCReg);
2235 
2236   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2237   if (GAFlags == SIInstrInfo::MO_NONE)
2238     MIB.addImm(0);
2239   else
2240     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2241 
2242   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2243 
2244   if (PtrTy.getSizeInBits() == 32)
2245     B.buildExtract(DstReg, PCReg, 0);
2246   return true;
2247 }
2248 
2249 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2250   MachineInstr &MI, MachineRegisterInfo &MRI,
2251   MachineIRBuilder &B) const {
2252   Register DstReg = MI.getOperand(0).getReg();
2253   LLT Ty = MRI.getType(DstReg);
2254   unsigned AS = Ty.getAddressSpace();
2255 
2256   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2257   MachineFunction &MF = B.getMF();
2258   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2259 
2260   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2261     if (!MFI->isModuleEntryFunction()) {
2262       const Function &Fn = MF.getFunction();
2263       DiagnosticInfoUnsupported BadLDSDecl(
2264         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2265         DS_Warning);
2266       Fn.getContext().diagnose(BadLDSDecl);
2267 
2268       // We currently don't have a way to correctly allocate LDS objects that
2269       // aren't directly associated with a kernel. We do force inlining of
2270       // functions that use local objects. However, if these dead functions are
2271       // not eliminated, we don't want a compile time error. Just emit a warning
2272       // and a trap, since there should be no callable path here.
2273       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2274       B.buildUndef(DstReg);
2275       MI.eraseFromParent();
2276       return true;
2277     }
2278 
2279     // TODO: We could emit code to handle the initialization somewhere.
2280     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2281       const SITargetLowering *TLI = ST.getTargetLowering();
2282       if (!TLI->shouldUseLDSConstAddress(GV)) {
2283         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2284         return true; // Leave in place;
2285       }
2286 
2287       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2288         Type *Ty = GV->getValueType();
2289         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2290         // zero-sized type in other languages to declare the dynamic shared
2291         // memory, whose size is not known at compile time. They will be
2292         // allocated by the runtime and placed directly after the statically
2293         // allocated ones. They all share the same offset.
2294         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2295           // Adjust alignment for that dynamic shared memory array.
2296           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2297           LLT S32 = LLT::scalar(32);
2298           auto Sz =
2299               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2300           B.buildIntToPtr(DstReg, Sz);
2301           MI.eraseFromParent();
2302           return true;
2303         }
2304       }
2305 
2306       B.buildConstant(
2307           DstReg,
2308           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2309       MI.eraseFromParent();
2310       return true;
2311     }
2312 
2313     const Function &Fn = MF.getFunction();
2314     DiagnosticInfoUnsupported BadInit(
2315       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2316     Fn.getContext().diagnose(BadInit);
2317     return true;
2318   }
2319 
2320   const SITargetLowering *TLI = ST.getTargetLowering();
2321 
2322   if (TLI->shouldEmitFixup(GV)) {
2323     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2324     MI.eraseFromParent();
2325     return true;
2326   }
2327 
2328   if (TLI->shouldEmitPCReloc(GV)) {
2329     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2330     MI.eraseFromParent();
2331     return true;
2332   }
2333 
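  // Otherwise the address must come from the GOT: build a pc-relative pointer
  // to the GOT entry and load the 64-bit global address from it, truncating
  // the result for 32-bit address spaces.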
2334   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2335   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2336 
2337   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2338       MachinePointerInfo::getGOT(MF),
2339       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2340           MachineMemOperand::MOInvariant,
2341       8 /*Size*/, Align(8));
2342 
2343   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2344 
2345   if (Ty.getSizeInBits() == 32) {
2346     // Truncate if this is a 32-bit constant address.
2347     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2348     B.buildExtract(DstReg, Load, 0);
2349   } else
2350     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2351 
2352   MI.eraseFromParent();
2353   return true;
2354 }
2355 
2356 static LLT widenToNextPowerOf2(LLT Ty) {
2357   if (Ty.isVector())
2358     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2359   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2360 }
2361 
2362 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2363                                        MachineInstr &MI) const {
2364   MachineIRBuilder &B = Helper.MIRBuilder;
2365   MachineRegisterInfo &MRI = *B.getMRI();
2366   GISelChangeObserver &Observer = Helper.Observer;
2367 
2368   Register PtrReg = MI.getOperand(1).getReg();
2369   LLT PtrTy = MRI.getType(PtrReg);
2370   unsigned AddrSpace = PtrTy.getAddressSpace();
2371 
2372   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2373     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2374     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2375     Observer.changingInstr(MI);
2376     MI.getOperand(1).setReg(Cast.getReg(0));
2377     Observer.changedInstr(MI);
2378     return true;
2379   }
2380 
2381   Register ValReg = MI.getOperand(0).getReg();
2382   LLT ValTy = MRI.getType(ValReg);
2383 
2384   MachineMemOperand *MMO = *MI.memoperands_begin();
2385   const unsigned ValSize = ValTy.getSizeInBits();
2386   const unsigned MemSize = 8 * MMO->getSize();
2387   const Align MemAlign = MMO->getAlign();
2388   const unsigned AlignInBits = 8 * MemAlign.value();
2389 
2390   // Widen non-power-of-2 loads to the alignment if needed
2391   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2392     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2393 
2394     // This was already the correct extending load result type, so just adjust
2395     // the memory type.
2396     if (WideMemSize == ValSize) {
2397       MachineFunction &MF = B.getMF();
2398 
2399       MachineMemOperand *WideMMO =
2400           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2401       Observer.changingInstr(MI);
2402       MI.setMemRefs(MF, {WideMMO});
2403       Observer.changedInstr(MI);
2404       return true;
2405     }
2406 
2407     // Don't bother handling edge case that should probably never be produced.
2408     if (ValSize > WideMemSize)
2409       return false;
2410 
2411     LLT WideTy = widenToNextPowerOf2(ValTy);
2412 
2413     Register WideLoad;
2414     if (!WideTy.isVector()) {
2415       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2416       B.buildTrunc(ValReg, WideLoad).getReg(0);
2417     } else {
2418       // Extract the subvector.
2419 
2420       if (isRegisterType(ValTy)) {
2421         // If this a case where G_EXTRACT is legal, use it.
2422         // (e.g. <3 x s32> -> <4 x s32>)
2423         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2424         B.buildExtract(ValReg, WideLoad, 0);
2425       } else {
2426         // For cases where the widened type isn't a nice register value, unmerge
2427         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2428         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2429         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2430         B.setInsertPt(B.getMBB(), MI.getIterator());
2431         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2432       }
2433     }
2434 
2435     MI.eraseFromParent();
2436     return true;
2437   }
2438 
2439   return false;
2440 }
2441 
2442 bool AMDGPULegalizerInfo::legalizeFMad(
2443   MachineInstr &MI, MachineRegisterInfo &MRI,
2444   MachineIRBuilder &B) const {
2445   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2446   assert(Ty.isScalar());
2447 
2448   MachineFunction &MF = B.getMF();
2449   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2450 
2451   // TODO: Always legal with future ftz flag.
2452   // FIXME: Is checking only the output denormal mode sufficient?
2453   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2454     return true;
2455   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2456     return true;
2457 
2458   MachineIRBuilder HelperBuilder(MI);
2459   GISelObserverWrapper DummyObserver;
2460   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2461   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2462 }
2463 
2464 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2465   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2466   Register DstReg = MI.getOperand(0).getReg();
2467   Register PtrReg = MI.getOperand(1).getReg();
2468   Register CmpVal = MI.getOperand(2).getReg();
2469   Register NewVal = MI.getOperand(3).getReg();
2470 
2471   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2472          "this should not have been custom lowered");
2473 
2474   LLT ValTy = MRI.getType(CmpVal);
2475   LLT VecTy = LLT::vector(2, ValTy);
2476 
2477   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2478 
2479   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2480     .addDef(DstReg)
2481     .addUse(PtrReg)
2482     .addUse(PackedVal)
2483     .setMemRefs(MI.memoperands());
2484 
2485   MI.eraseFromParent();
2486   return true;
2487 }
2488 
2489 bool AMDGPULegalizerInfo::legalizeFlog(
2490   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2491   Register Dst = MI.getOperand(0).getReg();
2492   Register Src = MI.getOperand(1).getReg();
2493   LLT Ty = B.getMRI()->getType(Dst);
2494   unsigned Flags = MI.getFlags();
2495 
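  // log_b(x) = log2(x) * (1 / log2(b)); the callers pass ln(2) for G_FLOG and
  // ln(2)/ln(10) (i.e. log10(2)) for G_FLOG10.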
2496   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2497   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2498 
2499   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2500   MI.eraseFromParent();
2501   return true;
2502 }
2503 
2504 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2505                                        MachineIRBuilder &B) const {
2506   Register Dst = MI.getOperand(0).getReg();
2507   Register Src = MI.getOperand(1).getReg();
2508   unsigned Flags = MI.getFlags();
2509   LLT Ty = B.getMRI()->getType(Dst);
2510 
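  // exp(x) = exp2(x * log2(e)).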
2511   auto K = B.buildFConstant(Ty, numbers::log2e);
2512   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2513   B.buildFExp2(Dst, Mul, Flags);
2514   MI.eraseFromParent();
2515   return true;
2516 }
2517 
2518 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2519                                        MachineIRBuilder &B) const {
2520   Register Dst = MI.getOperand(0).getReg();
2521   Register Src0 = MI.getOperand(1).getReg();
2522   Register Src1 = MI.getOperand(2).getReg();
2523   unsigned Flags = MI.getFlags();
2524   LLT Ty = B.getMRI()->getType(Dst);
2525   const LLT S16 = LLT::scalar(16);
2526   const LLT S32 = LLT::scalar(32);
2527 
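  // pow(x, y) = exp2(y * log2(x)). The multiply is done with fmul_legacy,
  // which treats 0 * anything as 0.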
2528   if (Ty == S32) {
2529     auto Log = B.buildFLog2(S32, Src0, Flags);
2530     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2531       .addUse(Log.getReg(0))
2532       .addUse(Src1)
2533       .setMIFlags(Flags);
2534     B.buildFExp2(Dst, Mul, Flags);
2535   } else if (Ty == S16) {
2536     // There's no f16 fmul_legacy, so we need to convert for it.
2537     auto Log = B.buildFLog2(S16, Src0, Flags);
2538     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2539     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2540     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2541       .addUse(Ext0.getReg(0))
2542       .addUse(Ext1.getReg(0))
2543       .setMIFlags(Flags);
2544 
2545     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2546   } else
2547     return false;
2548 
2549   MI.eraseFromParent();
2550   return true;
2551 }
2552 
2553 // Find a source register, ignoring any possible source modifiers.
2554 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2555   Register ModSrc = OrigSrc;
2556   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2557     ModSrc = SrcFNeg->getOperand(1).getReg();
2558     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2559       ModSrc = SrcFAbs->getOperand(1).getReg();
2560   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2561     ModSrc = SrcFAbs->getOperand(1).getReg();
2562   return ModSrc;
2563 }
2564 
2565 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2566                                          MachineRegisterInfo &MRI,
2567                                          MachineIRBuilder &B) const {
2568 
2569   const LLT S1 = LLT::scalar(1);
2570   const LLT S64 = LLT::scalar(64);
2571   Register Dst = MI.getOperand(0).getReg();
2572   Register OrigSrc = MI.getOperand(1).getReg();
2573   unsigned Flags = MI.getFlags();
2574   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2575          "this should not have been custom lowered");
2576 
2577   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2578   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2579   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2580   // V_FRACT bug is:
2581   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2582   //
2583   // Convert floor(x) to (x - fract(x))
2584 
2585   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2586     .addUse(OrigSrc)
2587     .setMIFlags(Flags);
2588 
2589   // Give source modifier matching some assistance before obscuring a foldable
2590   // pattern.
2591 
2592   // TODO: We can avoid the neg on the fract? The input sign to fract
2593   // shouldn't matter?
2594   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2595 
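  // 0x3fefffffffffffff is the largest double below 1.0, i.e. the
  // 0.99999999999999999 clamp from the workaround described above.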
2596   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2597 
2598   Register Min = MRI.createGenericVirtualRegister(S64);
2599 
2600   // We don't need to concern ourselves with the snan handling difference, so
2601   // use the one which will directly select.
2602   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2603   if (MFI->getMode().IEEE)
2604     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2605   else
2606     B.buildFMinNum(Min, Fract, Const, Flags);
2607 
2608   Register CorrectedFract = Min;
2609   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2610     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2611     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2612   }
2613 
2614   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2615   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2616 
2617   MI.eraseFromParent();
2618   return true;
2619 }
2620 
2621 // Turn an illegal packed v2s16 build vector into bit operations.
2622 // TODO: This should probably be a bitcast action in LegalizerHelper.
2623 bool AMDGPULegalizerInfo::legalizeBuildVector(
2624   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2625   Register Dst = MI.getOperand(0).getReg();
2626   const LLT S32 = LLT::scalar(32);
2627   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2628 
2629   Register Src0 = MI.getOperand(1).getReg();
2630   Register Src1 = MI.getOperand(2).getReg();
2631   assert(MRI.getType(Src0) == LLT::scalar(16));
2632 
2633   auto Merge = B.buildMerge(S32, {Src0, Src1});
2634   B.buildBitcast(Dst, Merge);
2635 
2636   MI.eraseFromParent();
2637   return true;
2638 }
2639 
2640 // Check that this is a G_XOR x, -1
2641 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2642   if (MI.getOpcode() != TargetOpcode::G_XOR)
2643     return false;
2644   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2645   return ConstVal && *ConstVal == -1;
2646 }
2647 
2648 // Return the use branch instruction, otherwise null if the usage is invalid.
2649 static MachineInstr *
2650 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2651                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2652   Register CondDef = MI.getOperand(0).getReg();
2653   if (!MRI.hasOneNonDBGUse(CondDef))
2654     return nullptr;
2655 
2656   MachineBasicBlock *Parent = MI.getParent();
2657   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2658 
2659   if (isNot(MRI, *UseMI)) {
2660     Register NegatedCond = UseMI->getOperand(0).getReg();
2661     if (!MRI.hasOneNonDBGUse(NegatedCond))
2662       return nullptr;
2663 
2664     // We're deleting the def of this value, so we need to remove it.
2665     UseMI->eraseFromParent();
2666 
2667     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2668     Negated = true;
2669   }
2670 
2671   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2672     return nullptr;
2673 
2674   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2675   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2676   if (Next == Parent->end()) {
2677     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2678     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2679       return nullptr;
2680     UncondBrTarget = &*NextMBB;
2681   } else {
2682     if (Next->getOpcode() != AMDGPU::G_BR)
2683       return nullptr;
2684     Br = &*Next;
2685     UncondBrTarget = Br->getOperand(0).getMBB();
2686   }
2687 
2688   return UseMI;
2689 }
2690 
2691 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2692                                          const ArgDescriptor *Arg,
2693                                          const TargetRegisterClass *ArgRC,
2694                                          LLT ArgTy) const {
2695   MCRegister SrcReg = Arg->getRegister();
2696   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2697   assert(DstReg.isVirtual() && "Virtual register expected");
2698 
2699   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2700                                              ArgTy);
2701   if (Arg->isMasked()) {
2702     // TODO: Should we try to emit this once in the entry block?
2703     const LLT S32 = LLT::scalar(32);
2704     const unsigned Mask = Arg->getMask();
2705     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2706 
2707     Register AndMaskSrc = LiveIn;
2708 
2709     if (Shift != 0) {
2710       auto ShiftAmt = B.buildConstant(S32, Shift);
2711       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2712     }
2713 
2714     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2715   } else {
2716     B.buildCopy(DstReg, LiveIn);
2717   }
2718 
2719   return true;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::loadInputValue(
2723     Register DstReg, MachineIRBuilder &B,
2724     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2725   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2726   const ArgDescriptor *Arg;
2727   const TargetRegisterClass *ArgRC;
2728   LLT ArgTy;
2729   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2730 
2731   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2732     return false; // TODO: Handle these
2733   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2734 }
2735 
2736 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2737     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2738     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2739   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2740     return false;
2741 
2742   MI.eraseFromParent();
2743   return true;
2744 }
2745 
2746 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2747                                        MachineRegisterInfo &MRI,
2748                                        MachineIRBuilder &B) const {
2749   Register Dst = MI.getOperand(0).getReg();
2750   LLT DstTy = MRI.getType(Dst);
2751   LLT S16 = LLT::scalar(16);
2752   LLT S32 = LLT::scalar(32);
2753   LLT S64 = LLT::scalar(64);
2754 
2755   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2756     return true;
2757 
2758   if (DstTy == S16)
2759     return legalizeFDIV16(MI, MRI, B);
2760   if (DstTy == S32)
2761     return legalizeFDIV32(MI, MRI, B);
2762   if (DstTy == S64)
2763     return legalizeFDIV64(MI, MRI, B);
2764 
2765   return false;
2766 }
2767 
2768 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2769                                                   Register DstReg,
2770                                                   Register X,
2771                                                   Register Y,
2772                                                   bool IsDiv) const {
2773   const LLT S1 = LLT::scalar(1);
2774   const LLT S32 = LLT::scalar(32);
2775 
2776   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2777   // algorithm used here.
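  // In outline: form z ~= 2^32 / y from the float reciprocal, refine it with
  // one Newton-Raphson step (z += umulh(z, z * -y)), take q = umulh(x, z) and
  // r = x - q * y, then correct q/r with up to two conditional subtractions
  // of y.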
2778 
2779   // Initial estimate of inv(y).
2780   auto FloatY = B.buildUITOFP(S32, Y);
2781   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2782   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2783   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2784   auto Z = B.buildFPTOUI(S32, ScaledY);
2785 
2786   // One round of UNR (unsigned Newton-Raphson refinement).
2787   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2788   auto NegYZ = B.buildMul(S32, NegY, Z);
2789   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2790 
2791   // Quotient/remainder estimate.
2792   auto Q = B.buildUMulH(S32, X, Z);
2793   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2794 
2795   // First quotient/remainder refinement.
2796   auto One = B.buildConstant(S32, 1);
2797   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2798   if (IsDiv)
2799     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2800   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2801 
2802   // Second quotient/remainder refinement.
2803   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2804   if (IsDiv)
2805     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2806   else
2807     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2808 }
2809 
2810 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2811                                               MachineRegisterInfo &MRI,
2812                                               MachineIRBuilder &B) const {
2813   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2814   Register DstReg = MI.getOperand(0).getReg();
2815   Register Num = MI.getOperand(1).getReg();
2816   Register Den = MI.getOperand(2).getReg();
2817   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2818   MI.eraseFromParent();
2819   return true;
2820 }
2821 
2822 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2823 //
2824 // Return lo, hi of result
2825 //
2826 // %cvt.lo = G_UITOFP Val.lo
2827 // %cvt.hi = G_UITOFP Val.hi
2828 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2829 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2830 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2831 // %mul2 = G_FMUL %mul1, 2**(-32)
2832 // %trunc = G_INTRINSIC_TRUNC %mul2
2833 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2834 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2835 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2836                                                        Register Val) {
2837   const LLT S32 = LLT::scalar(32);
2838   auto Unmerge = B.buildUnmerge(S32, Val);
2839 
2840   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2841   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2842 
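  // Null must survive the cast: a segment-null source selects the flat null
  // constant; anything else is built from the 32-bit segment offset in the
  // low half and the aperture in the high half.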
2843   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2844                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2845 
2846   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2847   auto Mul1 =
2848       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2849 
2850   // 2**(-32)
2851   auto Mul2 =
2852       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2853   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2854 
2855   // -(2**32)
2856   auto Mad2 = B.buildFMAD(S32, Trunc,
2857                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2858 
2859   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2860   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2861 
2862   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2863 }
2864 
2865 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2866                                                   Register DstReg,
2867                                                   Register Numer,
2868                                                   Register Denom,
2869                                                   bool IsDiv) const {
2870   const LLT S32 = LLT::scalar(32);
2871   const LLT S64 = LLT::scalar(64);
2872   const LLT S1 = LLT::scalar(1);
2873   Register RcpLo, RcpHi;
2874 
2875   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2876 
2877   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2878 
2879   auto Zero64 = B.buildConstant(S64, 0);
2880   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2881 
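  // Two rounds of fixed-point Newton-Raphson refinement of the 64-bit
  // reciprocal estimate; the additions are performed on 32-bit halves with
  // explicit carries.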
2882   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2883   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2884 
2885   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2886   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2887   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2888 
2889   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2890   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2891   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2892   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2893 
2894   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2895   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2896   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2897   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2898   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2899 
2900   auto Zero32 = B.buildConstant(S32, 0);
2901   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2902   auto Add2_HiC =
2903       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2904   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2905   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2906 
2907   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2908   Register NumerLo = UnmergeNumer.getReg(0);
2909   Register NumerHi = UnmergeNumer.getReg(1);
2910 
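  // Quotient estimate MulHi3 = umulh(Numer, Add2) and remainder estimate
  // Sub1 = Numer - MulHi3 * Denom.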
2911   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2912   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2913   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2914   Register Mul3_Lo = UnmergeMul3.getReg(0);
2915   Register Mul3_Hi = UnmergeMul3.getReg(1);
2916   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2917   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2918   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2919   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2920 
2921   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2922   Register DenomLo = UnmergeDenom.getReg(0);
2923   Register DenomHi = UnmergeDenom.getReg(1);
2924 
2925   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2926   auto C1 = B.buildSExt(S32, CmpHi);
2927 
2928   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2929   auto C2 = B.buildSExt(S32, CmpLo);
2930 
2931   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2932   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
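  // C3 is the 64-bit unsigned comparison Sub1 >= Denom, assembled from the
  // 32-bit half comparisons above.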
2933 
2934   // TODO: Here and below, portions of the code could be enclosed in if/endif
2935   // blocks. Currently control flow is unconditional and we have 4 selects
2936   // after the potential endif to substitute for PHIs.
2937 
2938   // if C3 != 0 ...
2939   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2940   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2941   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2942   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2943 
2944   auto One64 = B.buildConstant(S64, 1);
2945   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2946 
2947   auto C4 =
2948       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2949   auto C5 =
2950       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2951   auto C6 = B.buildSelect(
2952       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2953 
2954   // if (C6 != 0)
2955   auto Add4 = B.buildAdd(S64, Add3, One64);
2956   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2957 
2958   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2959   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2960   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2961 
2962   // endif C6
2963   // endif C3
2964 
2965   if (IsDiv) {
2966     auto Sel1 = B.buildSelect(
2967         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2968     B.buildSelect(DstReg,
2969                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2970   } else {
2971     auto Sel2 = B.buildSelect(
2972         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2973     B.buildSelect(DstReg,
2974                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2975   }
2976 }
2977 
2978 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2979                                             MachineRegisterInfo &MRI,
2980                                             MachineIRBuilder &B) const {
2981   const LLT S64 = LLT::scalar(64);
2982   const LLT S32 = LLT::scalar(32);
2983   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2984   Register DstReg = MI.getOperand(0).getReg();
2985   Register Num = MI.getOperand(1).getReg();
2986   Register Den = MI.getOperand(2).getReg();
2987   LLT Ty = MRI.getType(DstReg);
2988 
2989   if (Ty == S32)
2990     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2991   else if (Ty == S64)
2992     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2993   else
2994     return false;
2995 
2996   MI.eraseFromParent();
2997   return true;
2999 }
3000 
3001 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3002                                             MachineRegisterInfo &MRI,
3003                                             MachineIRBuilder &B) const {
3004   const LLT S64 = LLT::scalar(64);
3005   const LLT S32 = LLT::scalar(32);
3006 
3007   Register DstReg = MI.getOperand(0).getReg();
3008   const LLT Ty = MRI.getType(DstReg);
3009   if (Ty != S32 && Ty != S64)
3010     return false;
3011 
3012   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3013 
3014   Register LHS = MI.getOperand(1).getReg();
3015   Register RHS = MI.getOperand(2).getReg();
3016 
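  // Take absolute values via (x + sign) ^ sign with sign = x >> (bits - 1),
  // do the unsigned division, then restore the result's sign with the
  // mirrored xor/sub below.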
3017   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3018   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3019   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3020 
3021   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3022   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3023 
3024   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3025   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3026 
3027   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3028   if (Ty == S32)
3029     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3030   else
3031     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3032 
3033   Register Sign;
3034   if (IsDiv)
3035     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3036   else
3037     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3038 
3039   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3040   B.buildSub(DstReg, UDivRem, Sign);
3041 
3042   MI.eraseFromParent();
3043   return true;
3044 }
3045 
3046 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3047                                                  MachineRegisterInfo &MRI,
3048                                                  MachineIRBuilder &B) const {
3049   Register Res = MI.getOperand(0).getReg();
3050   Register LHS = MI.getOperand(1).getReg();
3051   Register RHS = MI.getOperand(2).getReg();
3052   uint16_t Flags = MI.getFlags();
3053   LLT ResTy = MRI.getType(Res);
3054 
3055   const MachineFunction &MF = B.getMF();
3056   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3057                             MI.getFlag(MachineInstr::FmAfn);
3058 
3059   if (!AllowInaccurateRcp)
3060     return false;
3061 
3062   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3063     // 1 / x -> RCP(x)
3064     if (CLHS->isExactlyValue(1.0)) {
3065       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3066         .addUse(RHS)
3067         .setMIFlags(Flags);
3068 
3069       MI.eraseFromParent();
3070       return true;
3071     }
3072 
3073     // -1 / x -> RCP( FNEG(x) )
3074     if (CLHS->isExactlyValue(-1.0)) {
3075       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3076       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3077         .addUse(FNeg.getReg(0))
3078         .setMIFlags(Flags);
3079 
3080       MI.eraseFromParent();
3081       return true;
3082     }
3083   }
3084 
3085   // x / y -> x * (1.0 / y)
3086   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3087     .addUse(RHS)
3088     .setMIFlags(Flags);
3089   B.buildFMul(Res, LHS, RCP, Flags);
3090 
3091   MI.eraseFromParent();
3092   return true;
3093 }
3094 
3095 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3096                                          MachineRegisterInfo &MRI,
3097                                          MachineIRBuilder &B) const {
3098   Register Res = MI.getOperand(0).getReg();
3099   Register LHS = MI.getOperand(1).getReg();
3100   Register RHS = MI.getOperand(2).getReg();
3101 
3102   uint16_t Flags = MI.getFlags();
3103 
3104   LLT S16 = LLT::scalar(16);
3105   LLT S32 = LLT::scalar(32);
3106 
3107   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3108   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3109 
3110   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3111     .addUse(RHSExt.getReg(0))
3112     .setMIFlags(Flags);
3113 
3114   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3115   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3116 
3117   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3118     .addUse(RDst.getReg(0))
3119     .addUse(RHS)
3120     .addUse(LHS)
3121     .setMIFlags(Flags);
3122 
3123   MI.eraseFromParent();
3124   return true;
3125 }
3126 
3127 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3128 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3129 static void toggleSPDenormMode(bool Enable,
3130                                MachineIRBuilder &B,
3131                                const GCNSubtarget &ST,
3132                                AMDGPU::SIModeRegisterDefaults Mode) {
3133   // Set SP denorm mode to this value.
3134   unsigned SPDenormMode =
3135     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3136 
3137   if (ST.hasDenormModeInst()) {
3138     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3139     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3140 
3141     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3142     B.buildInstr(AMDGPU::S_DENORM_MODE)
3143       .addImm(NewDenormModeValue);
3144 
3145   } else {
3146     // Select FP32 bit field in mode register.
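    // The field select encodes the MODE register, offset 4, width 2 (WIDTH_M1
    // holds width - 1), i.e. MODE bits 5:4, which control FP32 denormals.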
3147     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3148                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3149                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3150 
3151     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3152       .addImm(SPDenormMode)
3153       .addImm(SPDenormModeBitField);
3154   }
3155 }
3156 
3157 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3158                                          MachineRegisterInfo &MRI,
3159                                          MachineIRBuilder &B) const {
3160   Register Res = MI.getOperand(0).getReg();
3161   Register LHS = MI.getOperand(1).getReg();
3162   Register RHS = MI.getOperand(2).getReg();
3163   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3164   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3165 
3166   uint16_t Flags = MI.getFlags();
3167 
3168   LLT S32 = LLT::scalar(32);
3169   LLT S1 = LLT::scalar(1);
3170 
3171   auto One = B.buildFConstant(S32, 1.0f);
3172 
3173   auto DenominatorScaled =
3174     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3175       .addUse(LHS)
3176       .addUse(RHS)
3177       .addImm(0)
3178       .setMIFlags(Flags);
3179   auto NumeratorScaled =
3180     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3181       .addUse(LHS)
3182       .addUse(RHS)
3183       .addImm(1)
3184       .setMIFlags(Flags);
3185 
3186   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3187     .addUse(DenominatorScaled.getReg(0))
3188     .setMIFlags(Flags);
3189   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3190 
3191   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3192   // aren't modeled as reading it.
3193   if (!Mode.allFP32Denormals())
3194     toggleSPDenormMode(true, B, ST, Mode);
3195 
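  // One Newton-Raphson step refines the reciprocal of the scaled denominator
  // (Fma0/Fma1); Mul is the quotient estimate and Fma2-Fma4 compute residual
  // corrections before div_fmas/div_fixup below produce the final, fixed-up
  // result.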
3196   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3197   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3198   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3199   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3200   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3201   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3202 
3203   if (!Mode.allFP32Denormals())
3204     toggleSPDenormMode(false, B, ST, Mode);
3205 
3206   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3207     .addUse(Fma4.getReg(0))
3208     .addUse(Fma1.getReg(0))
3209     .addUse(Fma3.getReg(0))
3210     .addUse(NumeratorScaled.getReg(1))
3211     .setMIFlags(Flags);
3212 
3213   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3214     .addUse(Fmas.getReg(0))
3215     .addUse(RHS)
3216     .addUse(LHS)
3217     .setMIFlags(Flags);
3218 
3219   MI.eraseFromParent();
3220   return true;
3221 }
3222 
3223 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3224                                          MachineRegisterInfo &MRI,
3225                                          MachineIRBuilder &B) const {
3226   Register Res = MI.getOperand(0).getReg();
3227   Register LHS = MI.getOperand(1).getReg();
3228   Register RHS = MI.getOperand(2).getReg();
3229 
3230   uint16_t Flags = MI.getFlags();
3231 
3232   LLT S64 = LLT::scalar(64);
3233   LLT S1 = LLT::scalar(1);
3234 
3235   auto One = B.buildFConstant(S64, 1.0);
3236 
3237   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3238     .addUse(LHS)
3239     .addUse(RHS)
3240     .addImm(0)
3241     .setMIFlags(Flags);
3242 
3243   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3244 
3245   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3246     .addUse(DivScale0.getReg(0))
3247     .setMIFlags(Flags);
3248 
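  // Two Newton-Raphson refinement steps on the reciprocal of the scaled
  // denominator (Fma0-Fma3), then a quotient estimate (Mul) and residual
  // (Fma4) that feed div_fmas/div_fixup below.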
3249   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3250   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3251   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3252 
3253   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3254     .addUse(LHS)
3255     .addUse(RHS)
3256     .addImm(1)
3257     .setMIFlags(Flags);
3258 
3259   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3260   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3261   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3262 
3263   Register Scale;
3264   if (!ST.hasUsableDivScaleConditionOutput()) {
3265     // Workaround a hardware bug on SI where the condition output from div_scale
3266     // is not usable.
3267 
3268     LLT S32 = LLT::scalar(32);
3269 
3270     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3271     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3272     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3273     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3274 
3275     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3276                               Scale1Unmerge.getReg(1));
3277     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3278                               Scale0Unmerge.getReg(1));
3279     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3280   } else {
3281     Scale = DivScale1.getReg(1);
3282   }
3283 
3284   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3285     .addUse(Fma4.getReg(0))
3286     .addUse(Fma3.getReg(0))
3287     .addUse(Mul.getReg(0))
3288     .addUse(Scale)
3289     .setMIFlags(Flags);
3290 
3291   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3292     .addUse(Fmas.getReg(0))
3293     .addUse(RHS)
3294     .addUse(LHS)
3295     .setMIFlags(Flags);
3296 
3297   MI.eraseFromParent();
3298   return true;
3299 }
3300 
3301 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3302                                                  MachineRegisterInfo &MRI,
3303                                                  MachineIRBuilder &B) const {
3304   Register Res = MI.getOperand(0).getReg();
3305   Register LHS = MI.getOperand(2).getReg();
3306   Register RHS = MI.getOperand(3).getReg();
3307   uint16_t Flags = MI.getFlags();
3308 
3309   LLT S32 = LLT::scalar(32);
3310   LLT S1 = LLT::scalar(1);
3311 
3312   auto Abs = B.buildFAbs(S32, RHS, Flags);
3313   const APFloat C0Val(1.0f);
3314 
3315   auto C0 = B.buildConstant(S32, 0x6f800000);
3316   auto C1 = B.buildConstant(S32, 0x2f800000);
3317   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
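  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: if |RHS| exceeds 2^96, the
  // denominator is pre-scaled by 2^-32 so its reciprocal does not flush to
  // zero, and the final product is multiplied by the same factor to
  // compensate.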
3318 
3319   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3320   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3321 
3322   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3323 
3324   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3325     .addUse(Mul0.getReg(0))
3326     .setMIFlags(Flags);
3327 
3328   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3329 
3330   B.buildFMul(Res, Sel, Mul1, Flags);
3331 
3332   MI.eraseFromParent();
3333   return true;
3334 }
3335 
3336 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3337 // FIXME: Why do we handle this one but not other removed instructions?
3338 //
3339 // Reciprocal square root.  The clamp prevents infinite results, clamping
3340 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3341 // +-max_float.
3342 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3343                                                     MachineRegisterInfo &MRI,
3344                                                     MachineIRBuilder &B) const {
3345   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3346     return true;
3347 
3348   Register Dst = MI.getOperand(0).getReg();
3349   Register Src = MI.getOperand(2).getReg();
3350   auto Flags = MI.getFlags();
3351 
3352   LLT Ty = MRI.getType(Dst);
3353 
3354   const fltSemantics *FltSemantics;
3355   if (Ty == LLT::scalar(32))
3356     FltSemantics = &APFloat::IEEEsingle();
3357   else if (Ty == LLT::scalar(64))
3358     FltSemantics = &APFloat::IEEEdouble();
3359   else
3360     return false;
3361 
3362   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3363     .addUse(Src)
3364     .setMIFlags(Flags);
3365 
3366   // We don't need to concern ourselves with the snan handling difference,
3367   // since the rsq already quieted (or not); use the one that directly selects.
3368   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3369   const bool UseIEEE = MFI->getMode().IEEE;
3370 
3371   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3372   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3373                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3374 
3375   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3376 
3377   if (UseIEEE)
3378     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3379   else
3380     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3381   MI.eraseFromParent();
3382   return true;
3383 }
3384 
3385 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3386   switch (IID) {
3387   case Intrinsic::amdgcn_ds_fadd:
3388     return AMDGPU::G_ATOMICRMW_FADD;
3389   case Intrinsic::amdgcn_ds_fmin:
3390     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3391   case Intrinsic::amdgcn_ds_fmax:
3392     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3393   default:
3394     llvm_unreachable("not a DS FP intrinsic");
3395   }
3396 }
3397 
3398 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3399                                                       MachineInstr &MI,
3400                                                       Intrinsic::ID IID) const {
3401   GISelChangeObserver &Observer = Helper.Observer;
3402   Observer.changingInstr(MI);
3403 
3404   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3405 
3406   // The remaining operands were used to set fields in the MemOperand on
3407   // construction.
3408   for (int I = 6; I > 3; --I)
3409     MI.RemoveOperand(I);
3410 
3411   MI.RemoveOperand(1); // Remove the intrinsic ID.
3412   Observer.changedInstr(MI);
3413   return true;
3414 }
3415 
3416 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3417                                             MachineRegisterInfo &MRI,
3418                                             MachineIRBuilder &B) const {
3419   uint64_t Offset =
3420     ST.getTargetLowering()->getImplicitParameterOffset(
3421       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3422   LLT DstTy = MRI.getType(DstReg);
3423   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3424 
3425   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3426   if (!loadInputValue(KernargPtrReg, B,
3427                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3428     return false;
3429 
3430   // FIXME: This should be nuw
3431   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3432   return true;
3433 }
3434 
3435 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3436                                                  MachineRegisterInfo &MRI,
3437                                                  MachineIRBuilder &B) const {
3438   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3439   if (!MFI->isEntryFunction()) {
3440     return legalizePreloadedArgIntrin(MI, MRI, B,
3441                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3442   }
3443 
3444   Register DstReg = MI.getOperand(0).getReg();
3445   if (!getImplicitArgPtr(DstReg, MRI, B))
3446     return false;
3447 
3448   MI.eraseFromParent();
3449   return true;
3450 }
3451 
3452 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3453                                               MachineRegisterInfo &MRI,
3454                                               MachineIRBuilder &B,
3455                                               unsigned AddrSpace) const {
3456   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3457   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3458   Register Hi32 = Unmerge.getReg(1);
3459 
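  // A generic (flat) pointer is in the queried address space iff its high 32
  // bits match that segment's aperture base.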
3460   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3461   MI.eraseFromParent();
3462   return true;
3463 }
3464 
3465 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3466 // offset (the offset that is included in bounds checking and swizzling, to be
3467 // split between the instruction's voffset and immoffset fields) and soffset
3468 // (the offset that is excluded from bounds checking and swizzling, to go in
3469 // the instruction's soffset field).  This function takes the first kind of
3470 // offset and figures out how to split it between voffset and immoffset.
3471 std::tuple<Register, unsigned, unsigned>
3472 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3473                                         Register OrigOffset) const {
3474   const unsigned MaxImm = 4095;
3475   Register BaseReg;
3476   unsigned TotalConstOffset;
3477   const LLT S32 = LLT::scalar(32);
3478 
3479   std::tie(BaseReg, TotalConstOffset) =
3480       AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3481 
3482   unsigned ImmOffset = TotalConstOffset;
3483 
3484   // If the immediate value is too big for the immoffset field, put the value
3485   // and -4096 into the immoffset field so that the value that is copied/added
3486   // for the voffset field is a multiple of 4096, and it stands more chance
3487   // of being CSEd with the copy/add for another similar load/store.
3488   // However, do not do that rounding down to a multiple of 4096 if that is a
3489   // negative number, as it appears to be illegal to have a negative offset
3490   // in the vgpr, even if adding the immediate offset makes it positive.
3491   unsigned Overflow = ImmOffset & ~MaxImm;
3492   ImmOffset -= Overflow;
3493   if ((int32_t)Overflow < 0) {
3494     Overflow += ImmOffset;
3495     ImmOffset = 0;
3496   }
3497 
3498   if (Overflow != 0) {
3499     if (!BaseReg) {
3500       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3501     } else {
3502       auto OverflowVal = B.buildConstant(S32, Overflow);
3503       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3504     }
3505   }
3506 
3507   if (!BaseReg)
3508     BaseReg = B.buildConstant(S32, 0).getReg(0);
3509 
3510   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3511 }
3512 
3513 /// Handle register layout difference for f16 images for some subtargets.
3514 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3515                                              MachineRegisterInfo &MRI,
3516                                              Register Reg,
3517                                              bool ImageStore) const {
3518   const LLT S16 = LLT::scalar(16);
3519   const LLT S32 = LLT::scalar(32);
3520   LLT StoreVT = MRI.getType(Reg);
3521   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3522 
3523   if (ST.hasUnpackedD16VMem()) {
3524     auto Unmerge = B.buildUnmerge(S16, Reg);
3525 
3526     SmallVector<Register, 4> WideRegs;
3527     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3528       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3529 
3530     int NumElts = StoreVT.getNumElements();
3531 
3532     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3533   }
3534 
3535   if (ImageStore && ST.hasImageStoreD16Bug()) {
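    // Workaround: pad the packed D16 data with undef so it occupies one
    // 32-bit register per 16-bit component.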
3536     if (StoreVT.getNumElements() == 2) {
3537       SmallVector<Register, 4> PackedRegs;
3538       Reg = B.buildBitcast(S32, Reg).getReg(0);
3539       PackedRegs.push_back(Reg);
3540       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3541       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3542     }
3543 
3544     if (StoreVT.getNumElements() == 3) {
3545       SmallVector<Register, 4> PackedRegs;
3546       auto Unmerge = B.buildUnmerge(S16, Reg);
3547       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3548         PackedRegs.push_back(Unmerge.getReg(I));
3549       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3550       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3551       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3552     }
3553 
3554     if (StoreVT.getNumElements() == 4) {
3555       SmallVector<Register, 4> PackedRegs;
3556       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3557       auto Unmerge = B.buildUnmerge(S32, Reg);
3558       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3559         PackedRegs.push_back(Unmerge.getReg(I));
3560       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3561       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3562     }
3563 
3564     llvm_unreachable("invalid data type");
3565   }
3566 
3567   return Reg;
3568 }
3569 
3570 Register AMDGPULegalizerInfo::fixStoreSourceType(
3571   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3572   MachineRegisterInfo *MRI = B.getMRI();
3573   LLT Ty = MRI->getType(VData);
3574 
3575   const LLT S16 = LLT::scalar(16);
3576 
3577   // Fixup illegal register types for i8 and i16 stores.
3578   if (Ty == LLT::scalar(8) || Ty == S16) {
3579     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3580     return AnyExt;
3581   }
3582 
3583   if (Ty.isVector()) {
3584     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3585       if (IsFormat)
3586         return handleD16VData(B, *MRI, VData);
3587     }
3588   }
3589 
3590   return VData;
3591 }
3592 
3593 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3594                                               MachineRegisterInfo &MRI,
3595                                               MachineIRBuilder &B,
3596                                               bool IsTyped,
3597                                               bool IsFormat) const {
3598   Register VData = MI.getOperand(1).getReg();
3599   LLT Ty = MRI.getType(VData);
3600   LLT EltTy = Ty.getScalarType();
3601   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3602   const LLT S32 = LLT::scalar(32);
3603 
3604   VData = fixStoreSourceType(B, VData, IsFormat);
3605   Register RSrc = MI.getOperand(2).getReg();
3606 
3607   MachineMemOperand *MMO = *MI.memoperands_begin();
3608   const int MemSize = MMO->getSize();
3609 
3610   unsigned ImmOffset;
3611   unsigned TotalOffset;
3612 
3613   // The typed intrinsics add an immediate after the registers.
3614   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3615 
3616   // The struct intrinsic variants add one additional operand over raw.
3617   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3618   Register VIndex;
3619   int OpOffset = 0;
3620   if (HasVIndex) {
3621     VIndex = MI.getOperand(3).getReg();
3622     OpOffset = 1;
3623   }
3624 
3625   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3626   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3627 
3628   unsigned Format = 0;
3629   if (IsTyped) {
3630     Format = MI.getOperand(5 + OpOffset).getImm();
3631     ++OpOffset;
3632   }
3633 
3634   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3635 
3636   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3637   if (TotalOffset != 0)
3638     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3639 
3640   unsigned Opc;
3641   if (IsTyped) {
3642     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3643                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3644   } else if (IsFormat) {
3645     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3646                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3647   } else {
3648     switch (MemSize) {
3649     case 1:
3650       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3651       break;
3652     case 2:
3653       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3654       break;
3655     default:
3656       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3657       break;
3658     }
3659   }
3660 
3661   if (!VIndex)
3662     VIndex = B.buildConstant(S32, 0).getReg(0);
3663 
3664   auto MIB = B.buildInstr(Opc)
3665     .addUse(VData)              // vdata
3666     .addUse(RSrc)               // rsrc
3667     .addUse(VIndex)             // vindex
3668     .addUse(VOffset)            // voffset
3669     .addUse(SOffset)            // soffset
3670     .addImm(ImmOffset);         // offset(imm)
3671 
3672   if (IsTyped)
3673     MIB.addImm(Format);
3674 
3675   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3676      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3677      .addMemOperand(MMO);
3678 
3679   MI.eraseFromParent();
3680   return true;
3681 }
3682 
3683 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3684                                              MachineRegisterInfo &MRI,
3685                                              MachineIRBuilder &B,
3686                                              bool IsFormat,
3687                                              bool IsTyped) const {
3688   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3689   MachineMemOperand *MMO = *MI.memoperands_begin();
3690   const int MemSize = MMO->getSize();
3691   const LLT S32 = LLT::scalar(32);
3692 
3693   Register Dst = MI.getOperand(0).getReg();
3694   Register RSrc = MI.getOperand(2).getReg();
3695 
3696   // The typed intrinsics add an immediate after the registers.
3697   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3698 
3699   // The struct intrinsic variants add one additional operand over raw.
3700   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3701   Register VIndex;
3702   int OpOffset = 0;
3703   if (HasVIndex) {
3704     VIndex = MI.getOperand(3).getReg();
3705     OpOffset = 1;
3706   }
3707 
3708   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3709   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3710 
3711   unsigned Format = 0;
3712   if (IsTyped) {
3713     Format = MI.getOperand(5 + OpOffset).getImm();
3714     ++OpOffset;
3715   }
3716 
3717   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3718   unsigned ImmOffset;
3719   unsigned TotalOffset;
3720 
3721   LLT Ty = MRI.getType(Dst);
3722   LLT EltTy = Ty.getScalarType();
3723   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3724   const bool Unpacked = ST.hasUnpackedD16VMem();
3725 
3726   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3727   if (TotalOffset != 0)
3728     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3729 
3730   unsigned Opc;
3731 
3732   if (IsTyped) {
3733     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3734                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3735   } else if (IsFormat) {
3736     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3737                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3738   } else {
3739     switch (MemSize) {
3740     case 1:
3741       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3742       break;
3743     case 2:
3744       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3745       break;
3746     default:
3747       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3748       break;
3749     }
3750   }
3751 
3752   Register LoadDstReg;
3753 
3754   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3755   LLT UnpackedTy = Ty.changeElementSize(32);
3756 
3757   if (IsExtLoad)
3758     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3759   else if (Unpacked && IsD16 && Ty.isVector())
3760     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3761   else
3762     LoadDstReg = Dst;
3763 
3764   if (!VIndex)
3765     VIndex = B.buildConstant(S32, 0).getReg(0);
3766 
3767   auto MIB = B.buildInstr(Opc)
3768     .addDef(LoadDstReg)         // vdata
3769     .addUse(RSrc)               // rsrc
3770     .addUse(VIndex)             // vindex
3771     .addUse(VOffset)            // voffset
3772     .addUse(SOffset)            // soffset
3773     .addImm(ImmOffset);         // offset(imm)
3774 
3775   if (IsTyped)
3776     MIB.addImm(Format);
3777 
3778   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3779      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3780      .addMemOperand(MMO);
3781 
3782   if (LoadDstReg != Dst) {
3783     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3784 
3785     // The result for an extending load was widened; truncate it back down.
3786     if (IsExtLoad)
3787       B.buildTrunc(Dst, LoadDstReg);
3788     else {
3789       // Repack to original 16-bit vector result
3790       // FIXME: G_TRUNC should work, but legalization currently fails
3791       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3792       SmallVector<Register, 4> Repack;
3793       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3794         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3795       B.buildMerge(Dst, Repack);
3796     }
3797   }
3798 
3799   MI.eraseFromParent();
3800   return true;
3801 }
3802 
3803 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3804                                                MachineIRBuilder &B,
3805                                                bool IsInc) const {
3806   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3807                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3808   B.buildInstr(Opc)
3809     .addDef(MI.getOperand(0).getReg())
3810     .addUse(MI.getOperand(2).getReg())
3811     .addUse(MI.getOperand(3).getReg())
3812     .cloneMemRefs(MI);
3813   MI.eraseFromParent();
3814   return true;
3815 }
3816 
3817 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3818   switch (IntrID) {
3819   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3820   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3821     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3822   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3823   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3824     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3825   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3826   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3827     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3828   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3829   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3830     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3831   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3832   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3833     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3834   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3835   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3836     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3837   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3838   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3839     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3840   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3841   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3842     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3843   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3844   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3845     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3846   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3847   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3848     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3849   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3850   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3851     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3852   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3853   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3854     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3855   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3856   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3857     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3858   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3859   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3860     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3861   default:
3862     llvm_unreachable("unhandled atomic opcode");
3863   }
3864 }
3865 
3866 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3867                                                MachineIRBuilder &B,
3868                                                Intrinsic::ID IID) const {
3869   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3870                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3871   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3872 
3873   Register Dst;
3874 
3875   int OpOffset = 0;
3876   if (HasReturn) {
3877     // A few FP atomics do not support return values.
3878     Dst = MI.getOperand(0).getReg();
3879   } else {
3880     OpOffset = -1;
3881   }
3882 
3883   Register VData = MI.getOperand(2 + OpOffset).getReg();
3884   Register CmpVal;
3885 
3886   if (IsCmpSwap) {
3887     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3888     ++OpOffset;
3889   }
3890 
3891   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3892   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3893 
3894   // The struct intrinsic variants add one additional operand over raw.
3895   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3896   Register VIndex;
3897   if (HasVIndex) {
3898     VIndex = MI.getOperand(4 + OpOffset).getReg();
3899     ++OpOffset;
3900   }
3901 
3902   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3903   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3904   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3905 
3906   MachineMemOperand *MMO = *MI.memoperands_begin();
3907 
3908   unsigned ImmOffset;
3909   unsigned TotalOffset;
3910   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3911   if (TotalOffset != 0)
3912     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3913 
3914   if (!VIndex)
3915     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3916 
3917   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3918 
3919   if (HasReturn)
3920     MIB.addDef(Dst);
3921 
3922   MIB.addUse(VData); // vdata
3923 
3924   if (IsCmpSwap)
3925     MIB.addReg(CmpVal);
3926 
3927   MIB.addUse(RSrc)               // rsrc
3928      .addUse(VIndex)             // vindex
3929      .addUse(VOffset)            // voffset
3930      .addUse(SOffset)            // soffset
3931      .addImm(ImmOffset)          // offset(imm)
3932      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3933      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3934      .addMemOperand(MMO);
3935 
3936   MI.eraseFromParent();
3937   return true;
3938 }
3939 
3940 /// Turn the s16 typed address operands of \p MI into dword sized <2 x s16>
3941 /// vectors, appending the packed registers to \p PackedAddrs.
3942 static void packImageA16AddressToDwords(
3943     MachineIRBuilder &B, MachineInstr &MI,
3944     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3945     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3946   const LLT S16 = LLT::scalar(16);
3947   const LLT V2S16 = LLT::vector(2, 16);
3948 
3949   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3950     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3951     if (!SrcOp.isReg())
3952       continue; // _L to _LZ may have eliminated this.
3953 
3954     Register AddrReg = SrcOp.getReg();
3955 
3956     if (I < Intr->GradientStart) {
3957       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3958       PackedAddrs.push_back(AddrReg);
3959     } else {
3960       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3961       // derivatives dx/dh and dx/dv are packed with undef.
3962       if (((I + 1) >= EndIdx) ||
3963           ((Intr->NumGradients / 2) % 2 == 1 &&
3964            (I == static_cast<unsigned>(Intr->GradientStart +
3965                                        (Intr->NumGradients / 2) - 1) ||
3966             I == static_cast<unsigned>(Intr->GradientStart +
3967                                        Intr->NumGradients - 1))) ||
3968           // Check for _L to _LZ optimization
3969           !MI.getOperand(ArgOffset + I + 1).isReg()) {
3970         PackedAddrs.push_back(
3971             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3972                 .getReg(0));
3973       } else {
3974         PackedAddrs.push_back(
3975             B.buildBuildVector(
3976                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
3977                 .getReg(0));
3978         ++I;
3979       }
3980     }
3981   }
3982 }
3983 
3984 /// Convert from separate vaddr components to a single vector address register,
3985 /// and replace the remaining operands with $noreg.
3986 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3987                                      int DimIdx, int NumVAddrs) {
3988   const LLT S32 = LLT::scalar(32);
3989 
3990   SmallVector<Register, 8> AddrRegs;
3991   for (int I = 0; I != NumVAddrs; ++I) {
3992     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3993     if (SrcOp.isReg()) {
3994       AddrRegs.push_back(SrcOp.getReg());
3995       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3996     }
3997   }
3998 
3999   int NumAddrRegs = AddrRegs.size();
4000   if (NumAddrRegs != 1) {
4001     // Round up to 8 elements for v5-v7
4002     // FIXME: Missing intermediate sized register classes and instructions.
4003     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4004       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4005       auto Undef = B.buildUndef(S32);
4006       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4007       NumAddrRegs = RoundedNumRegs;
4008     }
4009 
4010     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4011     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4012   }
4013 
4014   for (int I = 1; I != NumVAddrs; ++I) {
4015     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4016     if (SrcOp.isReg())
4017       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4018   }
4019 }
4020 
4021 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4022 ///
4023 /// Depending on the subtarget, load/store with 16-bit element data need to be
4024 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4025 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4026 /// registers.
4027 ///
4028 /// We don't want to directly select image instructions just yet, but also want
4029 /// to expose all register repacking to the legalizer/combiners. We also don't
4030 /// want a selected instruction entering RegBankSelect. In order to avoid
4031 /// defining a multitude of intermediate image instructions, directly hack on
4032 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4033 /// now unnecessary arguments with $noreg.
4034 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4035     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4036     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4037 
4038   const unsigned NumDefs = MI.getNumExplicitDefs();
4039   const unsigned ArgOffset = NumDefs + 1;
4040   bool IsTFE = NumDefs == 2;
4041   // We are only processing the operands of d16 image operations on subtargets
4042   // that use the unpacked register layout, or need to repack the TFE result.
4043 
4044   // TODO: Do we need to guard against already legalized intrinsics?
4045   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4046       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4047 
4048   MachineRegisterInfo *MRI = B.getMRI();
4049   const LLT S32 = LLT::scalar(32);
4050   const LLT S16 = LLT::scalar(16);
4051   const LLT V2S16 = LLT::vector(2, 16);
4052 
4053   unsigned DMask = 0;
4054 
4055   // Check for 16-bit addresses and pack them if so.
4056   LLT GradTy =
4057       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4058   LLT AddrTy =
4059       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4060   const bool IsG16 = GradTy == S16;
4061   const bool IsA16 = AddrTy == S16;
4062 
4063   int DMaskLanes = 0;
4064   if (!BaseOpcode->Atomic) {
4065     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4066     if (BaseOpcode->Gather4) {
4067       DMaskLanes = 4;
4068     } else if (DMask != 0) {
4069       DMaskLanes = countPopulation(DMask);
4070     } else if (!IsTFE && !BaseOpcode->Store) {
4071       // If dmask is 0, this is a no-op load. This can be eliminated.
4072       B.buildUndef(MI.getOperand(0));
4073       MI.eraseFromParent();
4074       return true;
4075     }
4076   }
4077 
4078   Observer.changingInstr(MI);
4079   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4080 
4081   unsigned NewOpcode = NumDefs == 0 ?
4082     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4083 
4084   // Track that we legalized this
4085   MI.setDesc(B.getTII().get(NewOpcode));
4086 
4087   // Expecting to get an error flag since TFE is on and dmask is 0. Force
4088   // dmask to be at least 1, otherwise the instruction will fail.
4089   if (IsTFE && DMask == 0) {
4090     DMask = 0x1;
4091     DMaskLanes = 1;
4092     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4093   }
4094 
4095   if (BaseOpcode->Atomic) {
4096     Register VData0 = MI.getOperand(2).getReg();
4097     LLT Ty = MRI->getType(VData0);
4098 
4099     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4100     if (Ty.isVector())
4101       return false;
4102 
4103     if (BaseOpcode->AtomicX2) {
4104       Register VData1 = MI.getOperand(3).getReg();
4105       // The two values are packed in one register.
4106       LLT PackedTy = LLT::vector(2, Ty);
4107       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4108       MI.getOperand(2).setReg(Concat.getReg(0));
4109       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4110     }
4111   }
4112 
4113   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4114 
4115   // Optimize _L to _LZ when the constant LOD is zero (or negative)
4116   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4117           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4118     const ConstantFP *ConstantLod;
4119 
4120     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4121                  m_GFCst(ConstantLod))) {
4122       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4123         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4124         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4125             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4126                                                       Intr->Dim);
4127 
4128         // The starting indexes should remain in the same place.
4129         --CorrectedNumVAddrs;
4130 
4131         MI.getOperand(MI.getNumExplicitDefs())
4132             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4133         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4134         Intr = NewImageDimIntr;
4135       }
4136     }
4137   }
4138 
4139   // Optimize _mip away, when 'lod' is zero
4140   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4141     int64_t ConstantLod;
4142     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4143                  m_ICst(ConstantLod))) {
4144       if (ConstantLod == 0) {
4145         // TODO: Change the intrinsic opcode and remove the operand instead of
4146         // replacing it with 0, as is done for the _L to _LZ handling above.
4147         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4148         --CorrectedNumVAddrs;
4149       }
4150     }
4151   }
4152 
4153   // Rewrite the addressing register layout before doing anything else.
4154   if (IsA16 || IsG16) {
4155     if (IsA16) {
4156       // Target must support the feature and gradients need to be 16 bit too
4157       if (!ST.hasA16() || !IsG16)
4158         return false;
4159     } else if (!ST.hasG16())
4160       return false;
4161 
4162     if (Intr->NumVAddrs > 1) {
4163       SmallVector<Register, 4> PackedRegs;
4164       // Don't compress addresses for G16
4165       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4166       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4167                                   PackEndIdx);
4168 
4169       if (!IsA16) {
4170         // Add uncompressed address
4171         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4172           int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4173           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4174           PackedRegs.push_back(AddrReg);
4175         }
4176       }
4177 
4178       // See also below in the non-a16 branch
4179       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4180 
4181       if (!UseNSA && PackedRegs.size() > 1) {
4182         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4183         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4184         PackedRegs[0] = Concat.getReg(0);
4185         PackedRegs.resize(1);
4186       }
4187 
4188       const unsigned NumPacked = PackedRegs.size();
4189       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4190         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4191         if (!SrcOp.isReg()) {
4192           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4193           continue;
4194         }
4195 
4196         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4197 
4198         if (I - Intr->VAddrStart < NumPacked)
4199           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4200         else
4201           SrcOp.setReg(AMDGPU::NoRegister);
4202       }
4203     }
4204   } else {
4205     // If the register allocator cannot place the address registers contiguously
4206     // without introducing moves, then using the non-sequential address encoding
4207     // is always preferable, since it saves VALU instructions and is usually a
4208     // wash in terms of code size or even better.
4209     //
4210     // However, we currently have no way of hinting to the register allocator
4211     // that MIMG addresses should be placed contiguously when it is possible to
4212     // do so, so force non-NSA for the common 2-address case as a heuristic.
4213     //
4214     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4215     // allocation when possible.
4216     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4217 
4218     if (!UseNSA && Intr->NumVAddrs > 1)
4219       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4220                                Intr->NumVAddrs);
4221   }
4222 
4223   int Flags = 0;
4224   if (IsA16)
4225     Flags |= 1;
4226   if (IsG16)
4227     Flags |= 2;
4228   MI.addOperand(MachineOperand::CreateImm(Flags));
4229 
4230   if (BaseOpcode->Store) { // No TFE for stores?
4231     // TODO: Handle dmask trim
4232     Register VData = MI.getOperand(1).getReg();
4233     LLT Ty = MRI->getType(VData);
4234     if (!Ty.isVector() || Ty.getElementType() != S16)
4235       return true;
4236 
4237     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4238     if (RepackedReg != VData) {
4239       MI.getOperand(1).setReg(RepackedReg);
4240     }
4241 
4242     return true;
4243   }
4244 
4245   Register DstReg = MI.getOperand(0).getReg();
4246   LLT Ty = MRI->getType(DstReg);
4247   const LLT EltTy = Ty.getScalarType();
4248   const bool IsD16 = Ty.getScalarType() == S16;
4249   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4250 
4251   // Confirm that the return type is large enough for the dmask specified
4252   if (NumElts < DMaskLanes)
4253     return false;
4254 
4255   if (NumElts > 4 || DMaskLanes > 4)
4256     return false;
4257 
4258   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4259   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4260 
4261   // The raw dword aligned data component of the load. The only legal cases
4262   // where this matters should be when using the packed D16 format, for
4263   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4264   LLT RoundedTy;
4265 
4266   // S32 vector to cover all data, plus TFE result element.
4267   LLT TFETy;
4268 
4269   // Register type to use for each loaded component. Will be S32 or V2S16.
4270   LLT RegTy;
4271 
4272   if (IsD16 && ST.hasUnpackedD16VMem()) {
4273     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4274     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4275     RegTy = S32;
4276   } else {
4277     unsigned EltSize = EltTy.getSizeInBits();
4278     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4279     unsigned RoundedSize = 32 * RoundedElts;
4280     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4281     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4282     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4283   }
4284 
4285   // The return type does not need adjustment.
4286   // TODO: Should we change s16 case to s32 or <2 x s16>?
4287   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4288     return true;
4289 
4290   Register Dst1Reg;
4291 
4292   // Insert after the instruction.
4293   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4294 
4295   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4296   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4297   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4298   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4299 
4300   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4301 
4302   MI.getOperand(0).setReg(NewResultReg);
4303 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4308 
4309   if (IsTFE) {
4310     Dst1Reg = MI.getOperand(1).getReg();
4311     if (MRI->getType(Dst1Reg) != S32)
4312       return false;
4313 
4314     // TODO: Make sure the TFE operand bit is set.
4315     MI.RemoveOperand(1);
4316 
4317     // Handle the easy case that requires no repack instructions.
4318     if (Ty == S32) {
4319       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4320       return true;
4321     }
4322   }
4323 
4324   // Now figure out how to copy the new result register back into the old
4325   // result.
4326   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4327 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4329 
4330   if (ResultNumRegs == 1) {
4331     assert(!IsTFE);
4332     ResultRegs[0] = NewResultReg;
4333   } else {
4334     // We have to repack into a new vector of some kind.
4335     for (int I = 0; I != NumDataRegs; ++I)
4336       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4337     B.buildUnmerge(ResultRegs, NewResultReg);
4338 
4339     // Drop the final TFE element to get the data part. The TFE result is
4340     // directly written to the right place already.
4341     if (IsTFE)
4342       ResultRegs.resize(NumDataRegs);
4343   }
4344 
4345   // For an s16 scalar result, we form an s32 result with a truncate regardless
4346   // of packed vs. unpacked.
4347   if (IsD16 && !Ty.isVector()) {
4348     B.buildTrunc(DstReg, ResultRegs[0]);
4349     return true;
4350   }
4351 
4352   // Avoid a build/concat_vector of 1 entry.
4353   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4354     B.buildBitcast(DstReg, ResultRegs[0]);
4355     return true;
4356   }
4357 
4358   assert(Ty.isVector());
4359 
4360   if (IsD16) {
4361     // For packed D16 results with TFE enabled, all the data components are
4362     // S32. Cast back to the expected type.
4363     //
    // TODO: We don't really need to load s32 elements. We would only need
    // one cast for the TFE result if a multiple of v2s16 was used.
4366     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4367       for (Register &Reg : ResultRegs)
4368         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4369     } else if (ST.hasUnpackedD16VMem()) {
4370       for (Register &Reg : ResultRegs)
4371         Reg = B.buildTrunc(S16, Reg).getReg(0);
4372     }
4373   }
4374 
4375   auto padWithUndef = [&](LLT Ty, int NumElts) {
4376     if (NumElts == 0)
4377       return;
4378     Register Undef = B.buildUndef(Ty).getReg(0);
4379     for (int I = 0; I != NumElts; ++I)
4380       ResultRegs.push_back(Undef);
4381   };
4382 
4383   // Pad out any elements eliminated due to the dmask.
4384   LLT ResTy = MRI->getType(ResultRegs[0]);
4385   if (!ResTy.isVector()) {
4386     padWithUndef(ResTy, NumElts - ResultRegs.size());
4387     B.buildBuildVector(DstReg, ResultRegs);
4388     return true;
4389   }
4390 
4391   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4392   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4393 
4394   // Deal with the one annoying legal case.
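  // A <3 x s16> result only covers one and a half dwords, so pad the
  // <2 x s16> pieces out to three registers with undef, concatenate into
  // <6 x s16>, and unmerge into the <3 x s16> result plus a dead <3 x s16>.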
4395   const LLT V3S16 = LLT::vector(3, 16);
4396   if (Ty == V3S16) {
4397     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4398     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4399     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4400     return true;
4401   }
4402 
4403   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4404   B.buildConcatVectors(DstReg, ResultRegs);
4405   return true;
4406 }
4407 
4408 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4409   LegalizerHelper &Helper, MachineInstr &MI) const {
4410   MachineIRBuilder &B = Helper.MIRBuilder;
4411   GISelChangeObserver &Observer = Helper.Observer;
4412 
4413   Register Dst = MI.getOperand(0).getReg();
4414   LLT Ty = B.getMRI()->getType(Dst);
4415   unsigned Size = Ty.getSizeInBits();
4416   MachineFunction &MF = B.getMF();
4417 
4418   Observer.changingInstr(MI);
4419 
4420   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4421     Ty = getBitcastRegisterType(Ty);
4422     Helper.bitcastDst(MI, Ty, 0);
4423     Dst = MI.getOperand(0).getReg();
4424     B.setInsertPt(B.getMBB(), MI);
4425   }
4426 
4427   // FIXME: We don't really need this intermediate instruction. The intrinsic
4428   // should be fixed to have a memory operand. Since it's readnone, we're not
4429   // allowed to add one.
4430   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4431   MI.RemoveOperand(1); // Remove intrinsic ID
4432 
4433   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4434   // TODO: Should this use datalayout alignment?
4435   const unsigned MemSize = (Size + 7) / 8;
4436   const Align MemAlign(4);
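  // e.g. an s128 result gets a 16-byte, 4-byte aligned, invariant load MMO.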
4437   MachineMemOperand *MMO = MF.getMachineMemOperand(
4438       MachinePointerInfo(),
4439       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4440           MachineMemOperand::MOInvariant,
4441       MemSize, MemAlign);
4442   MI.addMemOperand(MF, MMO);
4443 
4444   // There are no 96-bit result scalar loads, but widening to 128-bit should
4445   // always be legal. We may need to restore this to a 96-bit result if it turns
4446   // out this needs to be converted to a vector load during RegBankSelect.
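  // For example, an s96 result is widened to s128 here, and a <3 x s32>
  // result to <4 x s32>.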
4447   if (!isPowerOf2_32(Size)) {
4448     if (Ty.isVector())
4449       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4450     else
4451       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4452   }
4453 
4454   Observer.changedInstr(MI);
4455   return true;
4456 }
4457 
4458 // TODO: Move to selection
4459 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4460                                                 MachineRegisterInfo &MRI,
4461                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4463   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4464       !ST.isTrapHandlerEnabled()) {
4465     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4466   } else {
4467     // Pass queue pointer to trap handler as input, and insert trap instruction
4468     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4469     MachineRegisterInfo &MRI = *B.getMRI();
4470 
    Register LiveIn = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4473     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4474       return false;
4475 
4476     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4477     B.buildCopy(SGPR01, LiveIn);
4478     B.buildInstr(AMDGPU::S_TRAP)
4479         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4480         .addReg(SGPR01, RegState::Implicit);
4481   }
4482 
4483   MI.eraseFromParent();
4484   return true;
4485 }
4486 
4487 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4488     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4491   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4492       !ST.isTrapHandlerEnabled()) {
4493     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4494                                      "debugtrap handler not supported",
4495                                      MI.getDebugLoc(), DS_Warning);
4496     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4497     Ctx.diagnose(NoTrap);
4498   } else {
4499     // Insert debug-trap instruction
4500     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4501   }
4502 
4503   MI.eraseFromParent();
4504   return true;
4505 }
4506 
4507 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4508                                                MachineIRBuilder &B) const {
4509   MachineRegisterInfo &MRI = *B.getMRI();
4510   const LLT S16 = LLT::scalar(16);
4511   const LLT S32 = LLT::scalar(32);
4512 
4513   Register DstReg = MI.getOperand(0).getReg();
4514   Register NodePtr = MI.getOperand(2).getReg();
4515   Register RayExtent = MI.getOperand(3).getReg();
4516   Register RayOrigin = MI.getOperand(4).getReg();
4517   Register RayDir = MI.getOperand(5).getReg();
4518   Register RayInvDir = MI.getOperand(6).getReg();
4519   Register TDescr = MI.getOperand(7).getReg();
4520 
4521   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4523   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4524                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4525                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4526                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4527 
4528   SmallVector<Register, 12> Ops;
4529   if (Is64) {
4530     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4531     Ops.push_back(Unmerge.getReg(0));
4532     Ops.push_back(Unmerge.getReg(1));
4533   } else {
4534     Ops.push_back(NodePtr);
4535   }
4536   Ops.push_back(RayExtent);
4537 
4538   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4539     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4540     Ops.push_back(Unmerge.getReg(0));
4541     Ops.push_back(Unmerge.getReg(1));
4542     Ops.push_back(Unmerge.getReg(2));
4543   };
4544 
4545   packLanes(RayOrigin);
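  // Only the x, y and z lanes of each ray vector are used; the fourth element
  // is dropped. With 16-bit addresses, the direction and inverse-direction
  // halves are additionally packed pairwise into three dwords:
  // (dir.x, dir.y), (dir.z, inv.x) and (inv.y, inv.z).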
4546   if (IsA16) {
4547     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4548     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4549     Register R1 = MRI.createGenericVirtualRegister(S32);
4550     Register R2 = MRI.createGenericVirtualRegister(S32);
4551     Register R3 = MRI.createGenericVirtualRegister(S32);
4552     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4553     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4554     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4555     Ops.push_back(R1);
4556     Ops.push_back(R2);
4557     Ops.push_back(R3);
4558   } else {
4559     packLanes(RayDir);
4560     packLanes(RayInvDir);
4561   }
4562 
4563   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4564     .addDef(DstReg)
4565     .addImm(Opcode);
4566 
4567   for (Register R : Ops) {
4568     MIB.addUse(R);
4569   }
4570 
4571   MIB.addUse(TDescr)
4572      .addImm(IsA16 ? 1 : 0)
4573      .cloneMemRefs(MI);
4574 
4575   MI.eraseFromParent();
4576   return true;
4577 }
4578 
4579 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4580                                             MachineInstr &MI) const {
4581   MachineIRBuilder &B = Helper.MIRBuilder;
4582   MachineRegisterInfo &MRI = *B.getMRI();
4583 
  // Replace uses of G_BRCOND with the exec-manipulating branch pseudos.
4585   auto IntrID = MI.getIntrinsicID();
4586   switch (IntrID) {
4587   case Intrinsic::amdgcn_if:
4588   case Intrinsic::amdgcn_else: {
4589     MachineInstr *Br = nullptr;
4590     MachineBasicBlock *UncondBrTarget = nullptr;
4591     bool Negated = false;
4592     if (MachineInstr *BrCond =
4593             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4594       const SIRegisterInfo *TRI
4595         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4596 
4597       Register Def = MI.getOperand(1).getReg();
4598       Register Use = MI.getOperand(3).getReg();
4599 
4600       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4601 
4602       if (Negated)
4603         std::swap(CondBrTarget, UncondBrTarget);
4604 
4605       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4606       if (IntrID == Intrinsic::amdgcn_if) {
4607         B.buildInstr(AMDGPU::SI_IF)
4608           .addDef(Def)
4609           .addUse(Use)
4610           .addMBB(UncondBrTarget);
4611       } else {
4612         B.buildInstr(AMDGPU::SI_ELSE)
4613             .addDef(Def)
4614             .addUse(Use)
4615             .addMBB(UncondBrTarget);
4616       }
4617 
4618       if (Br) {
4619         Br->getOperand(0).setMBB(CondBrTarget);
4620       } else {
4621         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4622         // since we're swapping branch targets it needs to be reinserted.
4623         // FIXME: IRTranslator should probably not do this
4624         B.buildBr(*CondBrTarget);
4625       }
4626 
4627       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4628       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4629       MI.eraseFromParent();
4630       BrCond->eraseFromParent();
4631       return true;
4632     }
4633 
4634     return false;
4635   }
4636   case Intrinsic::amdgcn_loop: {
4637     MachineInstr *Br = nullptr;
4638     MachineBasicBlock *UncondBrTarget = nullptr;
4639     bool Negated = false;
4640     if (MachineInstr *BrCond =
4641             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4642       const SIRegisterInfo *TRI
4643         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4644 
4645       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4646       Register Reg = MI.getOperand(2).getReg();
4647 
4648       if (Negated)
4649         std::swap(CondBrTarget, UncondBrTarget);
4650 
4651       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4652       B.buildInstr(AMDGPU::SI_LOOP)
4653         .addUse(Reg)
4654         .addMBB(UncondBrTarget);
4655 
4656       if (Br)
4657         Br->getOperand(0).setMBB(CondBrTarget);
4658       else
4659         B.buildBr(*CondBrTarget);
4660 
4661       MI.eraseFromParent();
4662       BrCond->eraseFromParent();
4663       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4664       return true;
4665     }
4666 
4667     return false;
4668   }
4669   case Intrinsic::amdgcn_kernarg_segment_ptr:
4670     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4671       // This only makes sense to call in a kernel, so just lower to null.
4672       B.buildConstant(MI.getOperand(0).getReg(), 0);
4673       MI.eraseFromParent();
4674       return true;
4675     }
4676 
4677     return legalizePreloadedArgIntrin(
4678       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4679   case Intrinsic::amdgcn_implicitarg_ptr:
4680     return legalizeImplicitArgPtr(MI, MRI, B);
4681   case Intrinsic::amdgcn_workitem_id_x:
4682     return legalizePreloadedArgIntrin(MI, MRI, B,
4683                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4684   case Intrinsic::amdgcn_workitem_id_y:
4685     return legalizePreloadedArgIntrin(MI, MRI, B,
4686                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4687   case Intrinsic::amdgcn_workitem_id_z:
4688     return legalizePreloadedArgIntrin(MI, MRI, B,
4689                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4690   case Intrinsic::amdgcn_workgroup_id_x:
4691     return legalizePreloadedArgIntrin(MI, MRI, B,
4692                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4693   case Intrinsic::amdgcn_workgroup_id_y:
4694     return legalizePreloadedArgIntrin(MI, MRI, B,
4695                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4696   case Intrinsic::amdgcn_workgroup_id_z:
4697     return legalizePreloadedArgIntrin(MI, MRI, B,
4698                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4699   case Intrinsic::amdgcn_dispatch_ptr:
4700     return legalizePreloadedArgIntrin(MI, MRI, B,
4701                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4702   case Intrinsic::amdgcn_queue_ptr:
4703     return legalizePreloadedArgIntrin(MI, MRI, B,
4704                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4705   case Intrinsic::amdgcn_implicit_buffer_ptr:
4706     return legalizePreloadedArgIntrin(
4707       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4708   case Intrinsic::amdgcn_dispatch_id:
4709     return legalizePreloadedArgIntrin(MI, MRI, B,
4710                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4711   case Intrinsic::amdgcn_fdiv_fast:
4712     return legalizeFDIVFastIntrin(MI, MRI, B);
4713   case Intrinsic::amdgcn_is_shared:
4714     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4715   case Intrinsic::amdgcn_is_private:
4716     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4717   case Intrinsic::amdgcn_wavefrontsize: {
4718     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4719     MI.eraseFromParent();
4720     return true;
4721   }
4722   case Intrinsic::amdgcn_s_buffer_load:
4723     return legalizeSBufferLoad(Helper, MI);
4724   case Intrinsic::amdgcn_raw_buffer_store:
4725   case Intrinsic::amdgcn_struct_buffer_store:
4726     return legalizeBufferStore(MI, MRI, B, false, false);
4727   case Intrinsic::amdgcn_raw_buffer_store_format:
4728   case Intrinsic::amdgcn_struct_buffer_store_format:
4729     return legalizeBufferStore(MI, MRI, B, false, true);
4730   case Intrinsic::amdgcn_raw_tbuffer_store:
4731   case Intrinsic::amdgcn_struct_tbuffer_store:
4732     return legalizeBufferStore(MI, MRI, B, true, true);
4733   case Intrinsic::amdgcn_raw_buffer_load:
4734   case Intrinsic::amdgcn_struct_buffer_load:
4735     return legalizeBufferLoad(MI, MRI, B, false, false);
4736   case Intrinsic::amdgcn_raw_buffer_load_format:
4737   case Intrinsic::amdgcn_struct_buffer_load_format:
4738     return legalizeBufferLoad(MI, MRI, B, true, false);
4739   case Intrinsic::amdgcn_raw_tbuffer_load:
4740   case Intrinsic::amdgcn_struct_tbuffer_load:
4741     return legalizeBufferLoad(MI, MRI, B, true, true);
4742   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4743   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4744   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4745   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4746   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4747   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4748   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4749   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4750   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4751   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4752   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4753   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4754   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4755   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4756   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4757   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4758   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4759   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4760   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4761   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4762   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4763   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4764   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4765   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4766   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4767   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4768   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4769   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4770     return legalizeBufferAtomic(MI, B, IntrID);
4771   case Intrinsic::amdgcn_atomic_inc:
4772     return legalizeAtomicIncDec(MI, B, true);
4773   case Intrinsic::amdgcn_atomic_dec:
4774     return legalizeAtomicIncDec(MI, B, false);
4775   case Intrinsic::trap:
4776     return legalizeTrapIntrinsic(MI, MRI, B);
4777   case Intrinsic::debugtrap:
4778     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4779   case Intrinsic::amdgcn_rsq_clamp:
4780     return legalizeRsqClampIntrinsic(MI, MRI, B);
4781   case Intrinsic::amdgcn_ds_fadd:
4782   case Intrinsic::amdgcn_ds_fmin:
4783   case Intrinsic::amdgcn_ds_fmax:
4784     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4785   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4786     return legalizeBVHIntrinsic(MI, B);
4787   default: {
4788     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4789             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4790       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4791     return true;
4792   }
4793   }
4794 
4795   return true;
4796 }
4797