1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/CodeGen/TargetOpcodes.h"
26 #include "llvm/CodeGen/ValueTypes.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31 
32 #define DEBUG_TYPE "amdgpu-legalinfo"
33 
34 using namespace llvm;
35 using namespace LegalizeActions;
36 using namespace LegalizeMutations;
37 using namespace LegalityPredicates;
38 using namespace MIPatternMatch;
39 
40 // Hack until load/store selection patterns support any tuple of legal types.
41 static cl::opt<bool> EnableNewLegality(
42   "amdgpu-global-isel-new-legality",
43   cl::desc("Use GlobalISel desired legality, rather than try to use"
44            "rules compatible with selection patterns"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 static constexpr unsigned MaxRegisterSize = 1024;
49 
50 // Round the number of elements to the next power of two elements
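// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.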
51 static LLT getPow2VectorType(LLT Ty) {
52   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
54   return Ty.changeNumElements(Pow2NElts);
55 }
56 
57 // Round the number of bits to the next power of two bits
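// e.g. s48 -> s64, s20 -> s32.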
58 static LLT getPow2ScalarType(LLT Ty) {
59   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
61   return LLT::scalar(Pow2Bits);
62 }
63 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
67 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     if (!Ty.isVector())
71       return false;
72 
73     const LLT EltTy = Ty.getElementType();
74     const unsigned EltSize = EltTy.getSizeInBits();
75     return Ty.getNumElements() % 2 != 0 &&
76            EltSize > 1 && EltSize < 32 &&
77            Ty.getSizeInBits() % 32 != 0;
78   };
79 }
80 
81 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     return Ty.getSizeInBits() % 32 == 0;
85   };
86 }
87 
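// True for vectors of 16-bit elements with more than two elements,
// e.g. <4 x s16> or <6 x s16>; <2 x s16> is excluded.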
88 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getScalarType();
92     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
93   };
94 }
95 
96 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99     const LLT EltTy = Ty.getElementType();
100     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
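// Reduce the number of vector elements so each resulting piece fits in at most
// 64 bits, e.g. <4 x s32> (128 bits) is broken into <2 x s32> pieces.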
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
112   };
113 }
114 
115 // Increase the number of vector elements to reach the next multiple of 32-bit
116 // type.
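// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits).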
117 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
118   return [=](const LegalityQuery &Query) {
119     const LLT Ty = Query.Types[TypeIdx];
120 
121     const LLT EltTy = Ty.getElementType();
122     const int Size = Ty.getSizeInBits();
123     const int EltSize = EltTy.getSizeInBits();
124     const int NextMul32 = (Size + 31) / 32;
125 
126     assert(EltSize < 32);
127 
128     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
129     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
130   };
131 }
132 
133 static LLT getBitcastRegisterType(const LLT Ty) {
134   const unsigned Size = Ty.getSizeInBits();
135 
  if (Size <= 32) {
138     // <2 x s8> -> s16
139     // <4 x s8> -> s32
140     return LLT::scalar(Size);
141   }
142 
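  // <4 x s16> -> <2 x s32>
  // <6 x s16> -> <3 x s32>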
143   return LLT::scalarOrVector(Size / 32, 32);
144 }
145 
146 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147   return [=](const LegalityQuery &Query) {
148     const LLT Ty = Query.Types[TypeIdx];
149     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
150   };
151 }
152 
153 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     unsigned Size = Ty.getSizeInBits();
157     assert(Size % 32 == 0);
158     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
159   };
160 }
161 
162 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
163   return [=](const LegalityQuery &Query) {
164     const LLT QueryTy = Query.Types[TypeIdx];
165     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
166   };
167 }
168 
169 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
170   return [=](const LegalityQuery &Query) {
171     const LLT QueryTy = Query.Types[TypeIdx];
172     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
173   };
174 }
175 
176 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
177   return [=](const LegalityQuery &Query) {
178     const LLT QueryTy = Query.Types[TypeIdx];
179     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
180   };
181 }
182 
183 static bool isRegisterSize(unsigned Size) {
184   return Size % 32 == 0 && Size <= MaxRegisterSize;
185 }
186 
187 static bool isRegisterVectorElementType(LLT EltTy) {
188   const int EltSize = EltTy.getSizeInBits();
189   return EltSize == 16 || EltSize % 32 == 0;
190 }
191 
192 static bool isRegisterVectorType(LLT Ty) {
193   const int EltSize = Ty.getElementType().getSizeInBits();
194   return EltSize == 32 || EltSize == 64 ||
195          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
196          EltSize == 128 || EltSize == 256;
197 }
198 
199 static bool isRegisterType(LLT Ty) {
200   if (!isRegisterSize(Ty.getSizeInBits()))
201     return false;
202 
203   if (Ty.isVector())
204     return isRegisterVectorType(Ty);
205 
206   return true;
207 }
208 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
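// e.g. s32, s64, v2s16, v4s32 and v8s64 qualify; v3s16 (48 bits) does not.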
211 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
212   return [=](const LegalityQuery &Query) {
213     return isRegisterType(Query.Types[TypeIdx]);
214   };
215 }
216 
217 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
218   return [=](const LegalityQuery &Query) {
219     const LLT QueryTy = Query.Types[TypeIdx];
220     if (!QueryTy.isVector())
221       return false;
222     const LLT EltTy = QueryTy.getElementType();
223     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
224   };
225 }
226 
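// A truncating store of a scalar wider than 32 bits, e.g. an s64 value stored
// with a 32-bit (or narrower) memory size.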
227 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229     const LLT Ty = Query.Types[TypeIdx];
230     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232   };
233 }
234 
235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239                                     bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return ST.enableFlatScratch() ? 128 : 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
249     // Treat constant and global as identical. SMRD loads are sometimes usable for
250     // global loads (ideally constant address space should be eliminated)
251     // depending on the context. Legality cannot be context dependent, but
252     // RegBankSelect can split the load as necessary depending on the pointer
253     // register bank/uniformity and if the memory is invariant or not written in a
254     // kernel.
255     return IsLoad ? 512 : 128;
256   default:
257     // Flat addresses may contextually need to be split to 32-bit parts if they
258     // may alias scratch depending on the subtarget.
259     return 128;
260   }
261 }
262 
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query,
265                                  unsigned Opcode) {
266   const LLT Ty = Query.Types[0];
267 
268   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
269   const bool IsLoad = Opcode != AMDGPU::G_STORE;
270 
271   unsigned RegSize = Ty.getSizeInBits();
272   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
273   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
274   unsigned AS = Query.Types[1].getAddressSpace();
275 
276   // All of these need to be custom lowered to cast the pointer operand.
277   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
278     return false;
279 
280   // TODO: We should be able to widen loads if the alignment is high enough, but
281   // we also need to modify the memory access size.
282 #if 0
283   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
286 #endif
287 
288   // Only 1-byte and 2-byte to 32-bit extloads are valid.
289   if (MemSize != RegSize && RegSize != 32)
290     return false;
291 
292   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
293     return false;
294 
295   switch (MemSize) {
296   case 8:
297   case 16:
298   case 32:
299   case 64:
300   case 128:
301     break;
302   case 96:
303     if (!ST.hasDwordx3LoadStores())
304       return false;
305     break;
306   case 256:
307   case 512:
308     // These may contextually need to be broken down.
309     break;
310   default:
311     return false;
312   }
313 
314   assert(RegSize >= MemSize);
315 
316   if (AlignBits < MemSize) {
317     const SITargetLowering *TLI = ST.getTargetLowering();
318     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
319                                                  Align(AlignBits / 8)))
320       return false;
321   }
322 
323   return true;
324 }
325 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
330 static bool loadStoreBitcastWorkaround(const LLT Ty) {
331   if (EnableNewLegality)
332     return false;
333 
334   const unsigned Size = Ty.getSizeInBits();
335   if (Size <= 64)
336     return false;
337   if (!Ty.isVector())
338     return true;
339 
340   LLT EltTy = Ty.getElementType();
341   if (EltTy.isPointer())
342     return true;
343 
344   unsigned EltSize = EltTy.getSizeInBits();
345   return EltSize != 32 && EltSize != 64;
346 }
347 
348 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
349                              unsigned Opcode) {
350   const LLT Ty = Query.Types[0];
351   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
352          !loadStoreBitcastWorkaround(Ty);
353 }
354 
355 /// Return true if a load or store of the type should be lowered with a bitcast
356 /// to a different type.
357 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
358                                        const unsigned MemSizeInBits) {
359   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
362 
363   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
364     return true;
365   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
366          !isRegisterVectorElementType(Ty.getElementType());
367 }
368 
369 /// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
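/// For example, an s96 load with 128-bit alignment may be widened to a 128-bit
/// load when 96-bit memory operations are unavailable.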
372 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
373                             unsigned AlignInBits, unsigned AddrSpace,
374                             unsigned Opcode) {
375   // We don't want to widen cases that are naturally legal.
376   if (isPowerOf2_32(SizeInBits))
377     return false;
378 
379   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
380   // end up widening these for a scalar load during RegBankSelect, since there
381   // aren't 96-bit scalar loads.
382   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
383     return false;
384 
385   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
386     return false;
387 
388   // A load is known dereferenceable up to the alignment, so it's legal to widen
389   // to it.
390   //
391   // TODO: Could check dereferenceable for less aligned cases.
392   unsigned RoundedSize = NextPowerOf2(SizeInBits);
393   if (AlignInBits < RoundedSize)
394     return false;
395 
396   // Do not widen if it would introduce a slow unaligned load.
397   const SITargetLowering *TLI = ST.getTargetLowering();
398   bool Fast = false;
399   return TLI->allowsMisalignedMemoryAccessesImpl(
400              RoundedSize, AddrSpace, Align(AlignInBits / 8),
401              MachineMemOperand::MOLoad, &Fast) &&
402          Fast;
403 }
404 
405 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
406                             unsigned Opcode) {
407   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
408     return false;
409 
410   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
411                          Query.MMODescrs[0].AlignInBits,
412                          Query.Types[1].getAddressSpace(), Opcode);
413 }
414 
415 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
416                                          const GCNTargetMachine &TM)
  : ST(ST_) {
418   using namespace TargetOpcode;
419 
420   auto GetAddrSpacePtr = [&TM](unsigned AS) {
421     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
422   };
423 
424   const LLT S1 = LLT::scalar(1);
425   const LLT S8 = LLT::scalar(8);
426   const LLT S16 = LLT::scalar(16);
427   const LLT S32 = LLT::scalar(32);
428   const LLT S64 = LLT::scalar(64);
429   const LLT S128 = LLT::scalar(128);
430   const LLT S256 = LLT::scalar(256);
431   const LLT S512 = LLT::scalar(512);
432   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
433 
434   const LLT V2S8 = LLT::vector(2, 8);
435   const LLT V2S16 = LLT::vector(2, 16);
436   const LLT V4S16 = LLT::vector(4, 16);
437 
438   const LLT V2S32 = LLT::vector(2, 32);
439   const LLT V3S32 = LLT::vector(3, 32);
440   const LLT V4S32 = LLT::vector(4, 32);
441   const LLT V5S32 = LLT::vector(5, 32);
442   const LLT V6S32 = LLT::vector(6, 32);
443   const LLT V7S32 = LLT::vector(7, 32);
444   const LLT V8S32 = LLT::vector(8, 32);
445   const LLT V9S32 = LLT::vector(9, 32);
446   const LLT V10S32 = LLT::vector(10, 32);
447   const LLT V11S32 = LLT::vector(11, 32);
448   const LLT V12S32 = LLT::vector(12, 32);
449   const LLT V13S32 = LLT::vector(13, 32);
450   const LLT V14S32 = LLT::vector(14, 32);
451   const LLT V15S32 = LLT::vector(15, 32);
452   const LLT V16S32 = LLT::vector(16, 32);
453   const LLT V32S32 = LLT::vector(32, 32);
454 
455   const LLT V2S64 = LLT::vector(2, 64);
456   const LLT V3S64 = LLT::vector(3, 64);
457   const LLT V4S64 = LLT::vector(4, 64);
458   const LLT V5S64 = LLT::vector(5, 64);
459   const LLT V6S64 = LLT::vector(6, 64);
460   const LLT V7S64 = LLT::vector(7, 64);
461   const LLT V8S64 = LLT::vector(8, 64);
462   const LLT V16S64 = LLT::vector(16, 64);
463 
464   std::initializer_list<LLT> AllS32Vectors =
465     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
466      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
467   std::initializer_list<LLT> AllS64Vectors =
468     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
469 
470   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
471   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
472   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
473   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
474   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
475   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
476   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
477 
478   const LLT CodePtr = FlatPtr;
479 
480   const std::initializer_list<LLT> AddrSpaces64 = {
481     GlobalPtr, ConstantPtr, FlatPtr
482   };
483 
484   const std::initializer_list<LLT> AddrSpaces32 = {
485     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
486   };
487 
488   const std::initializer_list<LLT> FPTypesBase = {
489     S32, S64
490   };
491 
492   const std::initializer_list<LLT> FPTypes16 = {
493     S32, S64, S16
494   };
495 
496   const std::initializer_list<LLT> FPTypesPK16 = {
497     S32, S64, S16, V2S16
498   };
499 
500   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
501 
502   setAction({G_BRCOND, S1}, Legal); // VCC branches
503   setAction({G_BRCOND, S32}, Legal); // SCC branches
504 
505   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
506   // elements for v3s16
507   getActionDefinitionsBuilder(G_PHI)
508     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
509     .legalFor(AllS32Vectors)
510     .legalFor(AllS64Vectors)
511     .legalFor(AddrSpaces64)
512     .legalFor(AddrSpaces32)
513     .legalIf(isPointer(0))
514     .clampScalar(0, S16, S256)
515     .widenScalarToNextPow2(0, 32)
516     .clampMaxNumElements(0, S32, 16)
517     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
518     .scalarize(0);
519 
520   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
521     // Full set of gfx9 features.
522     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
523       .legalFor({S32, S16, V2S16})
524       .clampScalar(0, S16, S32)
525       .clampMaxNumElements(0, S16, 2)
526       .scalarize(0)
527       .widenScalarToNextPow2(0, 32);
528 
529     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
530       .legalFor({S32, S16, V2S16}) // Clamp modifier
531       .minScalarOrElt(0, S16)
532       .clampMaxNumElements(0, S16, 2)
533       .scalarize(0)
534       .widenScalarToNextPow2(0, 32)
535       .lower();
536   } else if (ST.has16BitInsts()) {
537     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
538       .legalFor({S32, S16})
539       .clampScalar(0, S16, S32)
540       .scalarize(0)
541       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
542 
543     // Technically the saturating operations require clamp bit support, but this
544     // was introduced at the same time as 16-bit operations.
545     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
546       .legalFor({S32, S16}) // Clamp modifier
547       .minScalar(0, S16)
548       .scalarize(0)
549       .widenScalarToNextPow2(0, 16)
550       .lower();
551 
552     // We're just lowering this, but it helps get a better result to try to
553     // coerce to the desired type first.
554     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
555       .minScalar(0, S16)
556       .scalarize(0)
557       .lower();
558   } else {
559     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
560       .legalFor({S32})
561       .clampScalar(0, S32, S32)
562       .scalarize(0);
563 
564     if (ST.hasIntClamp()) {
565       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
566         .legalFor({S32}) // Clamp modifier.
567         .scalarize(0)
568         .minScalarOrElt(0, S32)
569         .lower();
570     } else {
571       // Clamp bit support was added in VI, along with 16-bit operations.
572       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
573         .minScalar(0, S32)
574         .scalarize(0)
575         .lower();
576     }
577 
578     // FIXME: DAG expansion gets better results. The widening uses the smaller
579     // range values and goes for the min/max lowering directly.
580     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
581       .minScalar(0, S32)
582       .scalarize(0)
583       .lower();
584   }
585 
586   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
587     .customFor({S32, S64})
588     .clampScalar(0, S32, S64)
589     .widenScalarToNextPow2(0, 32)
590     .scalarize(0);
591 
592   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
593                    .legalFor({S32})
594                    .maxScalarOrElt(0, S32);
595 
596   if (ST.hasVOP3PInsts()) {
597     Mulh
598       .clampMaxNumElements(0, S8, 2)
599       .lowerFor({V2S8});
600   }
601 
602   Mulh
603     .scalarize(0)
604     .lower();
605 
606   // Report legal for any types we can handle anywhere. For the cases only legal
607   // on the SALU, RegBankSelect will be able to re-legalize.
608   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
609     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
610     .clampScalar(0, S32, S64)
611     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
612     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
613     .widenScalarToNextPow2(0)
614     .scalarize(0);
615 
616   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
617                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
618     .legalFor({{S32, S1}, {S32, S32}})
619     .minScalar(0, S32)
620     // TODO: .scalarize(0)
621     .lower();
622 
623   getActionDefinitionsBuilder(G_BITCAST)
624     // Don't worry about the size constraint.
625     .legalIf(all(isRegisterType(0), isRegisterType(1)))
626     .lower();
627 
628 
629   getActionDefinitionsBuilder(G_CONSTANT)
630     .legalFor({S1, S32, S64, S16, GlobalPtr,
631                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
632     .legalIf(isPointer(0))
633     .clampScalar(0, S32, S64)
634     .widenScalarToNextPow2(0);
635 
636   getActionDefinitionsBuilder(G_FCONSTANT)
637     .legalFor({S32, S64, S16})
638     .clampScalar(0, S16, S64);
639 
640   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
641       .legalIf(isRegisterType(0))
642       // s1 and s16 are special cases because they have legal operations on
643       // them, but don't really occupy registers in the normal way.
644       .legalFor({S1, S16})
645       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
646       .clampScalarOrElt(0, S32, MaxScalar)
647       .widenScalarToNextPow2(0, 32)
648       .clampMaxNumElements(0, S32, 16);
649 
650   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
651 
652   // If the amount is divergent, we have to do a wave reduction to get the
653   // maximum value, so this is expanded during RegBankSelect.
654   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
655     .legalFor({{PrivatePtr, S32}});
656 
657   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
658     .customIf(typeIsNot(0, PrivatePtr));
659 
660   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
661 
662   auto &FPOpActions = getActionDefinitionsBuilder(
663     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
664     .legalFor({S32, S64});
665   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
666     .customFor({S32, S64});
667   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
668     .customFor({S32, S64});
669 
670   if (ST.has16BitInsts()) {
671     if (ST.hasVOP3PInsts())
672       FPOpActions.legalFor({S16, V2S16});
673     else
674       FPOpActions.legalFor({S16});
675 
676     TrigActions.customFor({S16});
677     FDIVActions.customFor({S16});
678   }
679 
680   auto &MinNumMaxNum = getActionDefinitionsBuilder({
681       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
682 
683   if (ST.hasVOP3PInsts()) {
684     MinNumMaxNum.customFor(FPTypesPK16)
685       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
686       .clampMaxNumElements(0, S16, 2)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else if (ST.has16BitInsts()) {
690     MinNumMaxNum.customFor(FPTypes16)
691       .clampScalar(0, S16, S64)
692       .scalarize(0);
693   } else {
694     MinNumMaxNum.customFor(FPTypesBase)
695       .clampScalar(0, S32, S64)
696       .scalarize(0);
697   }
698 
699   if (ST.hasVOP3PInsts())
700     FPOpActions.clampMaxNumElements(0, S16, 2);
701 
702   FPOpActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   TrigActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   FDIVActions
711     .scalarize(0)
712     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
713 
714   getActionDefinitionsBuilder({G_FNEG, G_FABS})
715     .legalFor(FPTypesPK16)
716     .clampMaxNumElements(0, S16, 2)
717     .scalarize(0)
718     .clampScalar(0, S16, S64);
719 
720   if (ST.has16BitInsts()) {
721     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
722       .legalFor({S32, S64, S16})
723       .scalarize(0)
724       .clampScalar(0, S16, S64);
725   } else {
726     getActionDefinitionsBuilder(G_FSQRT)
727       .legalFor({S32, S64})
728       .scalarize(0)
729       .clampScalar(0, S32, S64);
730 
731     if (ST.hasFractBug()) {
732       getActionDefinitionsBuilder(G_FFLOOR)
733         .customFor({S64})
734         .legalFor({S32, S64})
735         .scalarize(0)
736         .clampScalar(0, S32, S64);
737     } else {
738       getActionDefinitionsBuilder(G_FFLOOR)
739         .legalFor({S32, S64})
740         .scalarize(0)
741         .clampScalar(0, S32, S64);
742     }
743   }
744 
745   getActionDefinitionsBuilder(G_FPTRUNC)
746     .legalFor({{S32, S64}, {S16, S32}})
747     .scalarize(0)
748     .lower();
749 
750   getActionDefinitionsBuilder(G_FPEXT)
751     .legalFor({{S64, S32}, {S32, S16}})
752     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
753     .scalarize(0);
754 
755   getActionDefinitionsBuilder(G_FSUB)
756       // Use actual fsub instruction
757       .legalFor({S32})
758       // Must use fadd + fneg
759       .lowerFor({S64, S16, V2S16})
760       .scalarize(0)
761       .clampScalar(0, S32, S64);
762 
763   // Whether this is legal depends on the floating point mode for the function.
764   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
765   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
766     FMad.customFor({S32, S16});
767   else if (ST.hasMadMacF32Insts())
768     FMad.customFor({S32});
769   else if (ST.hasMadF16())
770     FMad.customFor({S16});
771   FMad.scalarize(0)
772       .lower();
773 
774   auto &FRem = getActionDefinitionsBuilder(G_FREM);
775   if (ST.has16BitInsts()) {
776     FRem.customFor({S16, S32, S64});
777   } else {
778     FRem.minScalar(0, S32)
779         .customFor({S32, S64});
780   }
781   FRem.scalarize(0);
782 
783   // TODO: Do we need to clamp maximum bitwidth?
784   getActionDefinitionsBuilder(G_TRUNC)
785     .legalIf(isScalar(0))
786     .legalFor({{V2S16, V2S32}})
787     .clampMaxNumElements(0, S16, 2)
788     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
789     // situations (like an invalid implicit use), we don't want to infinite loop
790     // in the legalizer.
791     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
792     .alwaysLegal();
793 
794   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
795     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
796                {S32, S1}, {S64, S1}, {S16, S1}})
797     .scalarize(0)
798     .clampScalar(0, S32, S64)
799     .widenScalarToNextPow2(1, 32);
800 
801   // TODO: Split s1->s64 during regbankselect for VALU.
802   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
803     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
804     .lowerFor({{S32, S64}})
805     .lowerIf(typeIs(1, S1))
806     .customFor({{S64, S64}});
807   if (ST.has16BitInsts())
808     IToFP.legalFor({{S16, S16}});
809   IToFP.clampScalar(1, S32, S64)
810        .minScalar(0, S32)
811        .scalarize(0)
812        .widenScalarToNextPow2(1);
813 
814   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
815     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
816     .customFor({{S64, S64}})
817     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
818   if (ST.has16BitInsts())
819     FPToI.legalFor({{S16, S16}});
820   else
821     FPToI.minScalar(1, S32);
822 
823   FPToI.minScalar(0, S32)
824        .scalarize(0)
825        .lower();
826 
827   // Lower roundeven into G_FRINT
828   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
829     .scalarize(0)
830     .lower();
831 
832   if (ST.has16BitInsts()) {
833     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
834       .legalFor({S16, S32, S64})
835       .clampScalar(0, S16, S64)
836       .scalarize(0);
837   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
838     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
839       .legalFor({S32, S64})
840       .clampScalar(0, S32, S64)
841       .scalarize(0);
842   } else {
843     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
844       .legalFor({S32})
845       .customFor({S64})
846       .clampScalar(0, S32, S64)
847       .scalarize(0);
848   }
849 
850   getActionDefinitionsBuilder(G_PTR_ADD)
851     .legalIf(all(isPointer(0), sameSize(0, 1)))
852     .scalarize(0)
853     .scalarSameSizeAs(1, 0);
854 
855   getActionDefinitionsBuilder(G_PTRMASK)
856     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
857     .scalarSameSizeAs(1, 0)
858     .scalarize(0);
859 
860   auto &CmpBuilder =
861     getActionDefinitionsBuilder(G_ICMP)
862     // The compare output type differs based on the register bank of the output,
863     // so make both s1 and s32 legal.
864     //
865     // Scalar compares producing output in scc will be promoted to s32, as that
866     // is the allocatable register type that will be needed for the copy from
867     // scc. This will be promoted during RegBankSelect, and we assume something
868     // before that won't try to use s32 result types.
869     //
870     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
871     // bank.
872     .legalForCartesianProduct(
873       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
874     .legalForCartesianProduct(
875       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
876   if (ST.has16BitInsts()) {
877     CmpBuilder.legalFor({{S1, S16}});
878   }
879 
880   CmpBuilder
881     .widenScalarToNextPow2(1)
882     .clampScalar(1, S32, S64)
883     .scalarize(0)
884     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
885 
886   getActionDefinitionsBuilder(G_FCMP)
887     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
888     .widenScalarToNextPow2(1)
889     .clampScalar(1, S32, S64)
890     .scalarize(0);
891 
892   // FIXME: fpow has a selection pattern that should move to custom lowering.
893   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
894   if (ST.has16BitInsts())
895     Exp2Ops.legalFor({S32, S16});
896   else
897     Exp2Ops.legalFor({S32});
898   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
899   Exp2Ops.scalarize(0);
900 
901   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
902   if (ST.has16BitInsts())
903     ExpOps.customFor({{S32}, {S16}});
904   else
905     ExpOps.customFor({S32});
906   ExpOps.clampScalar(0, MinScalarFPTy, S32)
907         .scalarize(0);
908 
909   getActionDefinitionsBuilder(G_FPOWI)
910     .clampScalar(0, MinScalarFPTy, S32)
911     .lower();
912 
913   // The 64-bit versions produce 32-bit results, but only on the SALU.
914   getActionDefinitionsBuilder(G_CTPOP)
915     .legalFor({{S32, S32}, {S32, S64}})
916     .clampScalar(0, S32, S32)
917     .clampScalar(1, S32, S64)
918     .scalarize(0)
919     .widenScalarToNextPow2(0, 32)
920     .widenScalarToNextPow2(1, 32);
921 
922   // The hardware instructions return a different result on 0 than the generic
923   // instructions expect. The hardware produces -1, but these produce the
924   // bitwidth.
925   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
926     .scalarize(0)
927     .clampScalar(0, S32, S32)
928     .clampScalar(1, S32, S64)
929     .widenScalarToNextPow2(0, 32)
930     .widenScalarToNextPow2(1, 32)
931     .lower();
932 
933   // The 64-bit versions produce 32-bit results, but only on the SALU.
934   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
935     .legalFor({{S32, S32}, {S32, S64}})
936     .clampScalar(0, S32, S32)
937     .clampScalar(1, S32, S64)
938     .scalarize(0)
939     .widenScalarToNextPow2(0, 32)
940     .widenScalarToNextPow2(1, 32);
941 
942   getActionDefinitionsBuilder(G_BITREVERSE)
943     .legalFor({S32})
944     .clampScalar(0, S32, S32)
945     .scalarize(0);
946 
947   if (ST.has16BitInsts()) {
948     getActionDefinitionsBuilder(G_BSWAP)
949       .legalFor({S16, S32, V2S16})
950       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
953       .widenScalarToNextPow2(0)
954       .clampScalar(0, S16, S32)
955       .scalarize(0);
956 
957     if (ST.hasVOP3PInsts()) {
958       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
959         .legalFor({S32, S16, V2S16})
960         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
961         .clampMaxNumElements(0, S16, 2)
962         .minScalar(0, S16)
963         .widenScalarToNextPow2(0)
964         .scalarize(0)
965         .lower();
966     } else {
967       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
968         .legalFor({S32, S16})
969         .widenScalarToNextPow2(0)
970         .minScalar(0, S16)
971         .scalarize(0)
972         .lower();
973     }
974   } else {
975     // TODO: Should have same legality without v_perm_b32
976     getActionDefinitionsBuilder(G_BSWAP)
977       .legalFor({S32})
978       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
981       .widenScalarToNextPow2(0)
982       .maxScalar(0, S32)
983       .scalarize(0)
984       .lower();
985 
986     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
987       .legalFor({S32})
988       .minScalar(0, S32)
989       .widenScalarToNextPow2(0)
990       .scalarize(0)
991       .lower();
992   }
993 
994   getActionDefinitionsBuilder(G_INTTOPTR)
995     // List the common cases
996     .legalForCartesianProduct(AddrSpaces64, {S64})
997     .legalForCartesianProduct(AddrSpaces32, {S32})
998     .scalarize(0)
999     // Accept any address space as long as the size matches
1000     .legalIf(sameSize(0, 1))
1001     .widenScalarIf(smallerThan(1, 0),
1002       [](const LegalityQuery &Query) {
1003         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004       })
1005     .narrowScalarIf(largerThan(1, 0),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1008       });
1009 
1010   getActionDefinitionsBuilder(G_PTRTOINT)
1011     // List the common cases
1012     .legalForCartesianProduct(AddrSpaces64, {S64})
1013     .legalForCartesianProduct(AddrSpaces32, {S32})
1014     .scalarize(0)
1015     // Accept any address space as long as the size matches
1016     .legalIf(sameSize(0, 1))
1017     .widenScalarIf(smallerThan(0, 1),
1018       [](const LegalityQuery &Query) {
1019         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1020       })
1021     .narrowScalarIf(
1022       largerThan(0, 1),
1023       [](const LegalityQuery &Query) {
1024         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1025       });
1026 
1027   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1028     .scalarize(0)
1029     .custom();
1030 
1031   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1032                                     bool IsLoad) -> bool {
1033     const LLT DstTy = Query.Types[0];
1034 
1035     // Split vector extloads.
1036     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1037     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1038 
1039     if (MemSize < DstTy.getSizeInBits())
1040       MemSize = std::max(MemSize, AlignBits);
1041 
1042     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1043       return true;
1044 
1045     const LLT PtrTy = Query.Types[1];
1046     unsigned AS = PtrTy.getAddressSpace();
1047     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1048       return true;
1049 
1050     // Catch weird sized loads that don't evenly divide into the access sizes
1051     // TODO: May be able to widen depending on alignment etc.
1052     unsigned NumRegs = (MemSize + 31) / 32;
1053     if (NumRegs == 3) {
1054       if (!ST.hasDwordx3LoadStores())
1055         return true;
1056     } else {
1057       // If the alignment allows, these should have been widened.
1058       if (!isPowerOf2_32(NumRegs))
1059         return true;
1060     }
1061 
1062     if (AlignBits < MemSize) {
1063       const SITargetLowering *TLI = ST.getTargetLowering();
1064       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1065                                                       Align(AlignBits / 8));
1066     }
1067 
1068     return false;
1069   };
1070 
1071   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1072   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1073   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1074 
1075   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1076   // LDS
1077   // TODO: Unsupported flat for SI.
1078 
1079   for (unsigned Op : {G_LOAD, G_STORE}) {
1080     const bool IsStore = Op == G_STORE;
1081 
1082     auto &Actions = getActionDefinitionsBuilder(Op);
1083     // Explicitly list some common cases.
1084     // TODO: Does this help compile time at all?
1085     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1086                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1087                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1088                                       {S64, GlobalPtr, 64, GlobalAlign32},
1089                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1090                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1091                                       {S32, GlobalPtr, 8, GlobalAlign8},
1092                                       {S32, GlobalPtr, 16, GlobalAlign16},
1093 
1094                                       {S32, LocalPtr, 32, 32},
1095                                       {S64, LocalPtr, 64, 32},
1096                                       {V2S32, LocalPtr, 64, 32},
1097                                       {S32, LocalPtr, 8, 8},
1098                                       {S32, LocalPtr, 16, 16},
1099                                       {V2S16, LocalPtr, 32, 32},
1100 
1101                                       {S32, PrivatePtr, 32, 32},
1102                                       {S32, PrivatePtr, 8, 8},
1103                                       {S32, PrivatePtr, 16, 16},
1104                                       {V2S16, PrivatePtr, 32, 32},
1105 
1106                                       {S32, ConstantPtr, 32, GlobalAlign32},
1107                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1108                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1109                                       {S64, ConstantPtr, 64, GlobalAlign32},
1110                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1111     Actions.legalIf(
1112       [=](const LegalityQuery &Query) -> bool {
1113         return isLoadStoreLegal(ST, Query, Op);
1114       });
1115 
1116     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1117     // 64-bits.
1118     //
1119     // TODO: Should generalize bitcast action into coerce, which will also cover
1120     // inserting addrspacecasts.
1121     Actions.customIf(typeIs(1, Constant32Ptr));
1122 
1123     // Turn any illegal element vectors into something easier to deal
1124     // with. These will ultimately produce 32-bit scalar shifts to extract the
1125     // parts anyway.
1126     //
1127     // For odd 16-bit element vectors, prefer to split those into pieces with
1128     // 16-bit vector parts.
1129     Actions.bitcastIf(
1130       [=](const LegalityQuery &Query) -> bool {
1131         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1132                                           Query.MMODescrs[0].SizeInBits);
1133       }, bitcastToRegisterType(0));
1134 
1135     if (!IsStore) {
1136       // Widen suitably aligned loads by loading extra bytes. The standard
1137       // legalization actions can't properly express widening memory operands.
1138       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1139         return shouldWidenLoad(ST, Query, G_LOAD);
1140       });
1141     }
1142 
1143     // FIXME: load/store narrowing should be moved to lower action
1144     Actions
1145         .narrowScalarIf(
1146             [=](const LegalityQuery &Query) -> bool {
1147               return !Query.Types[0].isVector() &&
1148                      needToSplitMemOp(Query, Op == G_LOAD);
1149             },
1150             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1151               const LLT DstTy = Query.Types[0];
1152               const LLT PtrTy = Query.Types[1];
1153 
1154               const unsigned DstSize = DstTy.getSizeInBits();
1155               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1156 
1157               // Split extloads.
1158               if (DstSize > MemSize)
1159                 return std::make_pair(0, LLT::scalar(MemSize));
1160 
1161               if (!isPowerOf2_32(DstSize)) {
1162                 // We're probably decomposing an odd sized store. Try to split
1163                 // to the widest type. TODO: Account for alignment. As-is it
1164                 // should be OK, since the new parts will be further legalized.
1165                 unsigned FloorSize = PowerOf2Floor(DstSize);
1166                 return std::make_pair(0, LLT::scalar(FloorSize));
1167               }
1168 
1169               if (DstSize > 32 && (DstSize % 32 != 0)) {
1170                 // FIXME: Need a way to specify non-extload of larger size if
1171                 // suitably aligned.
1172                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1173               }
1174 
1175               unsigned MaxSize = maxSizeForAddrSpace(ST,
1176                                                      PtrTy.getAddressSpace(),
1177                                                      Op == G_LOAD);
1178               if (MemSize > MaxSize)
1179                 return std::make_pair(0, LLT::scalar(MaxSize));
1180 
1181               unsigned Align = Query.MMODescrs[0].AlignInBits;
1182               return std::make_pair(0, LLT::scalar(Align));
1183             })
1184         .fewerElementsIf(
1185             [=](const LegalityQuery &Query) -> bool {
1186               return Query.Types[0].isVector() &&
1187                      needToSplitMemOp(Query, Op == G_LOAD);
1188             },
1189             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190               const LLT DstTy = Query.Types[0];
1191               const LLT PtrTy = Query.Types[1];
1192 
1193               LLT EltTy = DstTy.getElementType();
1194               unsigned MaxSize = maxSizeForAddrSpace(ST,
1195                                                      PtrTy.getAddressSpace(),
1196                                                      Op == G_LOAD);
1197 
1198               // FIXME: Handle widened to power of 2 results better. This ends
1199               // up scalarizing.
1200               // FIXME: 3 element stores scalarized on SI
1201 
1202               // Split if it's too large for the address space.
1203               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1204                 unsigned NumElts = DstTy.getNumElements();
1205                 unsigned EltSize = EltTy.getSizeInBits();
1206 
1207                 if (MaxSize % EltSize == 0) {
1208                   return std::make_pair(
1209                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1210                 }
1211 
1212                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1213 
1214                 // FIXME: Refine when odd breakdowns handled
1215                 // The scalars will need to be re-legalized.
1216                 if (NumPieces == 1 || NumPieces >= NumElts ||
1217                     NumElts % NumPieces != 0)
1218                   return std::make_pair(0, EltTy);
1219 
1220                 return std::make_pair(0,
1221                                       LLT::vector(NumElts / NumPieces, EltTy));
1222               }
1223 
1224               // FIXME: We could probably handle weird extending loads better.
1225               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1226               if (DstTy.getSizeInBits() > MemSize)
1227                 return std::make_pair(0, EltTy);
1228 
1229               unsigned EltSize = EltTy.getSizeInBits();
1230               unsigned DstSize = DstTy.getSizeInBits();
1231               if (!isPowerOf2_32(DstSize)) {
1232                 // We're probably decomposing an odd sized store. Try to split
1233                 // to the widest type. TODO: Account for alignment. As-is it
1234                 // should be OK, since the new parts will be further legalized.
1235                 unsigned FloorSize = PowerOf2Floor(DstSize);
1236                 return std::make_pair(
1237                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1238               }
1239 
1240               // Need to split because of alignment.
1241               unsigned Align = Query.MMODescrs[0].AlignInBits;
1242               if (EltSize > Align &&
1243                   (EltSize / Align < DstTy.getNumElements())) {
1244                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1245               }
1246 
1247               // May need relegalization for the scalars.
1248               return std::make_pair(0, EltTy);
1249             })
1250     .lowerIfMemSizeNotPow2()
1251     .minScalar(0, S32);
1252 
1253     if (IsStore)
1254       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1255 
1256     Actions
1257         .widenScalarToNextPow2(0)
1258         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1259         .lower();
1260   }
1261 
1262   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1263                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1264                                                   {S32, GlobalPtr, 16, 2 * 8},
1265                                                   {S32, LocalPtr, 8, 8},
1266                                                   {S32, LocalPtr, 16, 16},
1267                                                   {S32, PrivatePtr, 8, 8},
1268                                                   {S32, PrivatePtr, 16, 16},
1269                                                   {S32, ConstantPtr, 8, 8},
1270                                                   {S32, ConstantPtr, 16, 2 * 8}});
1271   if (ST.hasFlatAddressSpace()) {
1272     ExtLoads.legalForTypesWithMemDesc(
1273         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1274   }
1275 
1276   ExtLoads.clampScalar(0, S32, S32)
1277           .widenScalarToNextPow2(0)
1278           .unsupportedIfMemSizeNotPow2()
1279           .lower();
1280 
1281   auto &Atomics = getActionDefinitionsBuilder(
1282     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1283      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1284      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1285      G_ATOMICRMW_UMIN})
1286     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1287                {S64, GlobalPtr}, {S64, LocalPtr},
1288                {S32, RegionPtr}, {S64, RegionPtr}});
1289   if (ST.hasFlatAddressSpace()) {
1290     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1291   }
1292 
1293   if (ST.hasLDSFPAtomics()) {
1294     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1295       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1296   }
1297 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1310           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1311           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1312     .clampScalar(0, S16, S64)
1313     .scalarize(1)
1314     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1315     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1316     .clampMaxNumElements(0, S32, 2)
1317     .clampMaxNumElements(0, LocalPtr, 2)
1318     .clampMaxNumElements(0, PrivatePtr, 2)
1319     .scalarize(0)
1320     .widenScalarToNextPow2(0)
1321     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1322 
1323   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1324   // be more flexible with the shift amount type.
1325   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1326     .legalFor({{S32, S32}, {S64, S32}});
1327   if (ST.has16BitInsts()) {
1328     if (ST.hasVOP3PInsts()) {
1329       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1330             .clampMaxNumElements(0, S16, 2);
1331     } else
1332       Shifts.legalFor({{S16, S16}});
1333 
1334     // TODO: Support 16-bit shift amounts for all types
1335     Shifts.widenScalarIf(
1336       [=](const LegalityQuery &Query) {
1337         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1338         // 32-bit amount.
1339         const LLT ValTy = Query.Types[0];
1340         const LLT AmountTy = Query.Types[1];
1341         return ValTy.getSizeInBits() <= 16 &&
1342                AmountTy.getSizeInBits() < 16;
1343       }, changeTo(1, S16));
1344     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1345     Shifts.clampScalar(1, S32, S32);
1346     Shifts.clampScalar(0, S16, S64);
1347     Shifts.widenScalarToNextPow2(0, 16);
1348 
1349     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1350       .minScalar(0, S16)
1351       .scalarize(0)
1352       .lower();
1353   } else {
1354     // Make sure we legalize the shift amount type first, as the general
1355     // expansion for the shifted type will produce much worse code if it hasn't
1356     // been truncated already.
1357     Shifts.clampScalar(1, S32, S32);
1358     Shifts.clampScalar(0, S32, S64);
1359     Shifts.widenScalarToNextPow2(0, 32);
1360 
1361     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1362       .minScalar(0, S32)
1363       .scalarize(0)
1364       .lower();
1365   }
1366   Shifts.scalarize(0);
1367 
1368   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1369     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1370     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1371     unsigned IdxTypeIdx = 2;
1372 
1373     getActionDefinitionsBuilder(Op)
1374       .customIf([=](const LegalityQuery &Query) {
1375           const LLT EltTy = Query.Types[EltTypeIdx];
1376           const LLT VecTy = Query.Types[VecTypeIdx];
1377           const LLT IdxTy = Query.Types[IdxTypeIdx];
1378           const unsigned EltSize = EltTy.getSizeInBits();
1379           return (EltSize == 32 || EltSize == 64) &&
1380                   VecTy.getSizeInBits() % 32 == 0 &&
1381                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1382                   IdxTy.getSizeInBits() == 32;
1383         })
1384       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1385                  bitcastToVectorElement32(VecTypeIdx))
1386       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1387       .bitcastIf(
1388         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1389         [=](const LegalityQuery &Query) {
1390           // For > 64-bit element types, try to turn this into a 64-bit
1391           // element vector since we may be able to do better indexing
1392           // if this is scalar. If not, fall back to 32.
1393           const LLT EltTy = Query.Types[EltTypeIdx];
1394           const LLT VecTy = Query.Types[VecTypeIdx];
1395           const unsigned DstEltSize = EltTy.getSizeInBits();
1396           const unsigned VecSize = VecTy.getSizeInBits();
1397 
1398           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1399           return std::make_pair(
1400             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1401         })
1402       .clampScalar(EltTypeIdx, S32, S64)
1403       .clampScalar(VecTypeIdx, S32, S64)
1404       .clampScalar(IdxTypeIdx, S32, S32)
1405       .clampMaxNumElements(VecTypeIdx, S32, 32)
1406       // TODO: Clamp elements for 64-bit vectors?
1407       // It should only be necessary with variable indexes.
1408       // As a last resort, lower to the stack
1409       .lower();
1410   }
1411 
1412   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1413     .unsupportedIf([=](const LegalityQuery &Query) {
1414         const LLT &EltTy = Query.Types[1].getElementType();
1415         return Query.Types[0] != EltTy;
1416       });
1417 
1418   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1419     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1420     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1421 
1422     // FIXME: Doesn't handle extract of illegal sizes.
1423     getActionDefinitionsBuilder(Op)
1424       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1425       // FIXME: Multiples of 16 should not be legal.
1426       .legalIf([=](const LegalityQuery &Query) {
1427           const LLT BigTy = Query.Types[BigTyIdx];
1428           const LLT LitTy = Query.Types[LitTyIdx];
1429           return (BigTy.getSizeInBits() % 32 == 0) &&
1430                  (LitTy.getSizeInBits() % 16 == 0);
1431         })
1432       .widenScalarIf(
1433         [=](const LegalityQuery &Query) {
1434           const LLT BigTy = Query.Types[BigTyIdx];
1435           return (BigTy.getScalarSizeInBits() < 16);
1436         },
1437         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1438       .widenScalarIf(
1439         [=](const LegalityQuery &Query) {
1440           const LLT LitTy = Query.Types[LitTyIdx];
1441           return (LitTy.getScalarSizeInBits() < 16);
1442         },
1443         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1444       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1445       .widenScalarToNextPow2(BigTyIdx, 32);
1446 
1447   }
1448 
1449   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1450     .legalForCartesianProduct(AllS32Vectors, {S32})
1451     .legalForCartesianProduct(AllS64Vectors, {S64})
1452     .clampNumElements(0, V16S32, V32S32)
1453     .clampNumElements(0, V2S64, V16S64)
1454     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1455 
1456   if (ST.hasScalarPackInsts()) {
1457     BuildVector
1458       // FIXME: Should probably widen s1 vectors straight to s32
1459       .minScalarOrElt(0, S16)
1460       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1461       .minScalar(1, S32);
1462 
1463     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1464       .legalFor({V2S16, S32})
1465       .lower();
1466     BuildVector.minScalarOrElt(0, S32);
1467   } else {
1468     BuildVector.customFor({V2S16, S16});
1469     BuildVector.minScalarOrElt(0, S32);
1470 
1471     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1472       .customFor({V2S16, S32})
1473       .lower();
1474   }
1475 
1476   BuildVector.legalIf(isRegisterType(0));
1477 
1478   // FIXME: Clamp maximum size
1479   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1480     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1481     .clampMaxNumElements(0, S32, 32)
1482     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1483     .clampMaxNumElements(0, S16, 64);
1484 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1486   // pre-legalize.
1487   if (ST.hasVOP3PInsts()) {
1488     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1489       .customFor({V2S16, V2S16})
1490       .lower();
1491   } else
1492     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1493 
1494   // Merge/Unmerge
1495   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1496     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1497     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1498 
1499     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1500       const LLT Ty = Query.Types[TypeIdx];
1501       if (Ty.isVector()) {
1502         const LLT &EltTy = Ty.getElementType();
1503         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1504           return true;
1505         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1506           return true;
1507       }
1508       return false;
1509     };
1510 
1511     auto &Builder = getActionDefinitionsBuilder(Op)
1512       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513       .lowerFor({{S16, V2S16}})
1514       .lowerIf([=](const LegalityQuery &Query) {
1515           const LLT BigTy = Query.Types[BigTyIdx];
1516           return BigTy.getSizeInBits() == 32;
1517         })
1518       // Try to widen to s16 first for small types.
1519       // TODO: Only do this on targets with legal s16 shifts
1520       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1521       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1522       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1523       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1524                            elementTypeIs(1, S16)),
1525                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1529       .clampScalar(LitTyIdx, S32, S512)
1530       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1531       // Break up vectors with weird elements into scalars
1532       .fewerElementsIf(
1533         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1534         scalarize(0))
1535       .fewerElementsIf(
1536         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1537         scalarize(1))
1538       .clampScalar(BigTyIdx, S32, MaxScalar);
1539 
1540     if (Op == G_MERGE_VALUES) {
1541       Builder.widenScalarIf(
1542         // TODO: Use 16-bit shifts if legal for 8-bit values?
1543         [=](const LegalityQuery &Query) {
1544           const LLT Ty = Query.Types[LitTyIdx];
1545           return Ty.getSizeInBits() < 32;
1546         },
1547         changeTo(LitTyIdx, S32));
1548     }
1549 
1550     Builder.widenScalarIf(
1551       [=](const LegalityQuery &Query) {
1552         const LLT Ty = Query.Types[BigTyIdx];
1553         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1554           Ty.getSizeInBits() % 16 != 0;
1555       },
1556       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
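        // For example, an s136 type here widens to s192 rather than to s256,
        // since 192 is the smaller of the two candidates.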
1559         const LLT &Ty = Query.Types[BigTyIdx];
1560         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1561         if (NewSizeInBits >= 256) {
1562           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1563           if (RoundedTo < NewSizeInBits)
1564             NewSizeInBits = RoundedTo;
1565         }
1566         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1567       })
1568       // Any vectors left are the wrong size. Scalarize them.
1569       .scalarize(0)
1570       .scalarize(1);
1571   }
1572 
1573   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1574   // RegBankSelect.
1575   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1576     .legalFor({{S32}, {S64}});
1577 
1578   if (ST.hasVOP3PInsts()) {
1579     SextInReg.lowerFor({{V2S16}})
1580       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1581       // get more vector shift opportunities, since we'll get those when
1582       // expanded.
1583       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1584   } else if (ST.has16BitInsts()) {
1585     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1586   } else {
1587     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1589     SextInReg.lowerFor({{S32}, {S64}});
1590   }
1591 
1592   SextInReg
1593     .scalarize(0)
1594     .clampScalar(0, S32, S64)
1595     .lower();
1596 
1597   getActionDefinitionsBuilder(G_FSHR)
1598     .legalFor({{S32, S32}})
1599     .scalarize(0)
1600     .lower();
1601 
1602   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1603     .legalFor({S64});
1604 
1605   getActionDefinitionsBuilder(G_FENCE)
1606     .alwaysLegal();
1607 
1608   getActionDefinitionsBuilder({
1609       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1610       G_FCOPYSIGN,
1611 
1612       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1613       G_ATOMICRMW_NAND,
1614       G_ATOMICRMW_FSUB,
1615       G_READ_REGISTER,
1616       G_WRITE_REGISTER,
1617 
1618       G_SADDO, G_SSUBO,
1619 
      // TODO: Implement
1621       G_FMINIMUM, G_FMAXIMUM,
1622       G_FSHL
1623     }).lower();
1624 
1625   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1626         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1627         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1628     .unsupported();
1629 
1630   computeTables();
1631   verify(*ST.getInstrInfo());
1632 }
1633 
1634 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1635                                          MachineInstr &MI) const {
1636   MachineIRBuilder &B = Helper.MIRBuilder;
1637   MachineRegisterInfo &MRI = *B.getMRI();
1638 
1639   switch (MI.getOpcode()) {
1640   case TargetOpcode::G_ADDRSPACE_CAST:
1641     return legalizeAddrSpaceCast(MI, MRI, B);
1642   case TargetOpcode::G_FRINT:
1643     return legalizeFrint(MI, MRI, B);
1644   case TargetOpcode::G_FCEIL:
1645     return legalizeFceil(MI, MRI, B);
1646   case TargetOpcode::G_FREM:
1647     return legalizeFrem(MI, MRI, B);
1648   case TargetOpcode::G_INTRINSIC_TRUNC:
1649     return legalizeIntrinsicTrunc(MI, MRI, B);
1650   case TargetOpcode::G_SITOFP:
1651     return legalizeITOFP(MI, MRI, B, true);
1652   case TargetOpcode::G_UITOFP:
1653     return legalizeITOFP(MI, MRI, B, false);
1654   case TargetOpcode::G_FPTOSI:
1655     return legalizeFPTOI(MI, MRI, B, true);
1656   case TargetOpcode::G_FPTOUI:
1657     return legalizeFPTOI(MI, MRI, B, false);
1658   case TargetOpcode::G_FMINNUM:
1659   case TargetOpcode::G_FMAXNUM:
1660   case TargetOpcode::G_FMINNUM_IEEE:
1661   case TargetOpcode::G_FMAXNUM_IEEE:
1662     return legalizeMinNumMaxNum(Helper, MI);
1663   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1664     return legalizeExtractVectorElt(MI, MRI, B);
1665   case TargetOpcode::G_INSERT_VECTOR_ELT:
1666     return legalizeInsertVectorElt(MI, MRI, B);
1667   case TargetOpcode::G_SHUFFLE_VECTOR:
1668     return legalizeShuffleVector(MI, MRI, B);
1669   case TargetOpcode::G_FSIN:
1670   case TargetOpcode::G_FCOS:
1671     return legalizeSinCos(MI, MRI, B);
1672   case TargetOpcode::G_GLOBAL_VALUE:
1673     return legalizeGlobalValue(MI, MRI, B);
1674   case TargetOpcode::G_LOAD:
1675     return legalizeLoad(Helper, MI);
1676   case TargetOpcode::G_FMAD:
1677     return legalizeFMad(MI, MRI, B);
1678   case TargetOpcode::G_FDIV:
1679     return legalizeFDIV(MI, MRI, B);
1680   case TargetOpcode::G_UDIV:
1681   case TargetOpcode::G_UREM:
1682     return legalizeUDIV_UREM(MI, MRI, B);
1683   case TargetOpcode::G_SDIV:
1684   case TargetOpcode::G_SREM:
1685     return legalizeSDIV_SREM(MI, MRI, B);
1686   case TargetOpcode::G_ATOMIC_CMPXCHG:
1687     return legalizeAtomicCmpXChg(MI, MRI, B);
1688   case TargetOpcode::G_FLOG:
1689     return legalizeFlog(MI, B, numbers::ln2f);
1690   case TargetOpcode::G_FLOG10:
1691     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1692   case TargetOpcode::G_FEXP:
1693     return legalizeFExp(MI, B);
1694   case TargetOpcode::G_FPOW:
1695     return legalizeFPow(MI, B);
1696   case TargetOpcode::G_FFLOOR:
1697     return legalizeFFloor(MI, MRI, B);
1698   case TargetOpcode::G_BUILD_VECTOR:
1699     return legalizeBuildVector(MI, MRI, B);
1700   default:
1701     return false;
1702   }
1703 
1704   llvm_unreachable("expected switch to return");
1705 }
1706 
1707 Register AMDGPULegalizerInfo::getSegmentAperture(
1708   unsigned AS,
1709   MachineRegisterInfo &MRI,
1710   MachineIRBuilder &B) const {
1711   MachineFunction &MF = B.getMF();
1712   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1713   const LLT S32 = LLT::scalar(32);
1714 
1715   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1716 
1717   if (ST.hasApertureRegs()) {
1718     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1719     // getreg.
1720     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1721         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1722         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1723     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1724         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1725         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1726     unsigned Encoding =
1727         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1728         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1729         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1730 
1731     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1732 
1733     B.buildInstr(AMDGPU::S_GETREG_B32)
1734       .addDef(GetReg)
1735       .addImm(Encoding);
1736     MRI.setType(GetReg, S32);
1737 
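    // The aperture field is returned in the low bits of the result; shift it
    // up by the field width to form the high 32 bits of the segment base
    // address.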
1738     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1739     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1740   }
1741 
1742   Register QueuePtr = MRI.createGenericVirtualRegister(
1743     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1744 
1745   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1746     return Register();
1747 
1748   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1749   // private_segment_aperture_base_hi.
1750   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1751 
1752   // TODO: can we be smarter about machine pointer info?
1753   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1754   MachineMemOperand *MMO = MF.getMachineMemOperand(
1755       PtrInfo,
1756       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1757           MachineMemOperand::MOInvariant,
1758       4, commonAlignment(Align(64), StructOffset));
1759 
1760   Register LoadAddr;
1761 
1762   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1763   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1764 }
1765 
1766 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1767   MachineInstr &MI, MachineRegisterInfo &MRI,
1768   MachineIRBuilder &B) const {
1769   MachineFunction &MF = B.getMF();
1770 
1771   const LLT S32 = LLT::scalar(32);
1772   Register Dst = MI.getOperand(0).getReg();
1773   Register Src = MI.getOperand(1).getReg();
1774 
1775   LLT DstTy = MRI.getType(Dst);
1776   LLT SrcTy = MRI.getType(Src);
1777   unsigned DestAS = DstTy.getAddressSpace();
1778   unsigned SrcAS = SrcTy.getAddressSpace();
1779 
1780   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1781   // vector element.
1782   assert(!DstTy.isVector());
1783 
1784   const AMDGPUTargetMachine &TM
1785     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1786 
1787   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1788     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1789     return true;
1790   }
1791 
1792   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1793     // Truncate.
1794     B.buildExtract(Dst, Src, 0);
1795     MI.eraseFromParent();
1796     return true;
1797   }
1798 
1799   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1800     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1801     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1802 
    // FIXME: This is a bit ugly since we build a merge of two 32-bit pointers
    // into another pointer type. Merge operands are required to be the same
    // type, but creating an extra ptrtoint would be kind of pointless.
1806     auto HighAddr = B.buildConstant(
1807       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1808     B.buildMerge(Dst, {Src, HighAddr});
1809     MI.eraseFromParent();
1810     return true;
1811   }
1812 
1813   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1814     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1815            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1816     unsigned NullVal = TM.getNullPointerValue(DestAS);
1817 
1818     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1819     auto FlatNull = B.buildConstant(SrcTy, 0);
1820 
1821     // Extract low 32-bits of the pointer.
1822     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1823 
1824     auto CmpRes =
1825         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1826     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1827 
1828     MI.eraseFromParent();
1829     return true;
1830   }
1831 
1832   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1833     return false;
1834 
1835   if (!ST.hasFlatAddressSpace())
1836     return false;
1837 
1838   auto SegmentNull =
1839       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1840   auto FlatNull =
1841       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1842 
1843   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1844   if (!ApertureReg.isValid())
1845     return false;
1846 
1847   auto CmpRes =
1848       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1849 
1850   // Coerce the type of the low half of the result so we can use merge_values.
1851   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1852 
1853   // TODO: Should we allow mismatched types but matching sizes in merges to
1854   // avoid the ptrtoint?
1855   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1856   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1857 
1858   MI.eraseFromParent();
1859   return true;
1860 }
1861 
1862 bool AMDGPULegalizerInfo::legalizeFrint(
1863   MachineInstr &MI, MachineRegisterInfo &MRI,
1864   MachineIRBuilder &B) const {
1865   Register Src = MI.getOperand(1).getReg();
1866   LLT Ty = MRI.getType(Src);
1867   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1868 
1869   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1870   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1871 
1872   auto C1 = B.buildFConstant(Ty, C1Val);
1873   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1874 
1875   // TODO: Should this propagate fast-math-flags?
1876   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1877   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1878 
1879   auto C2 = B.buildFConstant(Ty, C2Val);
1880   auto Fabs = B.buildFAbs(Ty, Src);
1881 
1882   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1883   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1884   MI.eraseFromParent();
1885   return true;
1886 }
1887 
1888 bool AMDGPULegalizerInfo::legalizeFceil(
1889   MachineInstr &MI, MachineRegisterInfo &MRI,
1890   MachineIRBuilder &B) const {
1891 
1892   const LLT S1 = LLT::scalar(1);
1893   const LLT S64 = LLT::scalar(64);
1894 
1895   Register Src = MI.getOperand(1).getReg();
1896   assert(MRI.getType(Src) == S64);
1897 
1898   // result = trunc(src)
1899   // if (src > 0.0 && src != result)
1900   //   result += 1.0
1901 
1902   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1903 
1904   const auto Zero = B.buildFConstant(S64, 0.0);
1905   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1909   auto Add = B.buildSelect(S64, And, One, Zero);
1910 
1911   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1914 }
1915 
1916 bool AMDGPULegalizerInfo::legalizeFrem(
1917   MachineInstr &MI, MachineRegisterInfo &MRI,
1918   MachineIRBuilder &B) const {
1919     Register DstReg = MI.getOperand(0).getReg();
1920     Register Src0Reg = MI.getOperand(1).getReg();
1921     Register Src1Reg = MI.getOperand(2).getReg();
1922     auto Flags = MI.getFlags();
1923     LLT Ty = MRI.getType(DstReg);
1924 
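    // frem(x, y) is expanded as x - trunc(x / y) * y, with the multiply and
    // subtract fused into a single fma.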
1925     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1926     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1927     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1928     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1929     MI.eraseFromParent();
1930     return true;
1931 }
1932 
1933 static MachineInstrBuilder extractF64Exponent(Register Hi,
1934                                               MachineIRBuilder &B) {
1935   const unsigned FractBits = 52;
1936   const unsigned ExpBits = 11;
1937   LLT S32 = LLT::scalar(32);
1938 
1939   auto Const0 = B.buildConstant(S32, FractBits - 32);
1940   auto Const1 = B.buildConstant(S32, ExpBits);
1941 
1942   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1943     .addUse(Hi)
1944     .addUse(Const0.getReg(0))
1945     .addUse(Const1.getReg(0));
1946 
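  // Subtract the IEEE double exponent bias (1023) to get the unbiased
  // exponent.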
1947   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1948 }
1949 
1950 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1951   MachineInstr &MI, MachineRegisterInfo &MRI,
1952   MachineIRBuilder &B) const {
1953   const LLT S1 = LLT::scalar(1);
1954   const LLT S32 = LLT::scalar(32);
1955   const LLT S64 = LLT::scalar(64);
1956 
1957   Register Src = MI.getOperand(1).getReg();
1958   assert(MRI.getType(Src) == S64);
1959 
1960   // TODO: Should this use extract since the low half is unused?
1961   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1962   Register Hi = Unmerge.getReg(1);
1963 
1964   // Extract the upper half, since this is where we will find the sign and
1965   // exponent.
1966   auto Exp = extractF64Exponent(Hi, B);
1967 
1968   const unsigned FractBits = 52;
1969 
1970   // Extract the sign bit.
1971   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1972   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1973 
1974   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1975 
1976   const auto Zero32 = B.buildConstant(S32, 0);
1977 
1978   // Extend back to 64-bits.
1979   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1980 
1981   auto Shr = B.buildAShr(S64, FractMask, Exp);
1982   auto Not = B.buildNot(S64, Shr);
1983   auto Tmp0 = B.buildAnd(S64, Src, Not);
1984   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1985 
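  // If the exponent is negative, |Src| < 1.0 and the result is just the sign
  // bit (i.e. +/-0.0). If the exponent is greater than 51, every fraction bit
  // is an integer bit and Src is already an integral value.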
1986   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1987   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1988 
1989   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1990   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1991   MI.eraseFromParent();
1992   return true;
1993 }
1994 
1995 bool AMDGPULegalizerInfo::legalizeITOFP(
1996   MachineInstr &MI, MachineRegisterInfo &MRI,
1997   MachineIRBuilder &B, bool Signed) const {
1998 
1999   Register Dst = MI.getOperand(0).getReg();
2000   Register Src = MI.getOperand(1).getReg();
2001 
2002   const LLT S64 = LLT::scalar(64);
2003   const LLT S32 = LLT::scalar(32);
2004 
2005   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2006 
2007   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2008 
2009   auto CvtHi = Signed ?
2010     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2011     B.buildUITOFP(S64, Unmerge.getReg(1));
2012 
2013   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2014 
2015   auto ThirtyTwo = B.buildConstant(S32, 32);
2016   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2017     .addUse(CvtHi.getReg(0))
2018     .addUse(ThirtyTwo.getReg(0));
2019 
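  // ldexp(CvtHi, 32) scales the converted high half by 2**32; adding the
  // unsigned-converted low half then completes the 64-bit conversion.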
2020   // TODO: Should this propagate fast-math-flags?
2021   B.buildFAdd(Dst, LdExp, CvtLo);
2022   MI.eraseFromParent();
2023   return true;
2024 }
2025 
2026 // TODO: Copied from DAG implementation. Verify logic and document how this
2027 // actually works.
2028 bool AMDGPULegalizerInfo::legalizeFPTOI(
2029   MachineInstr &MI, MachineRegisterInfo &MRI,
2030   MachineIRBuilder &B, bool Signed) const {
2031 
2032   Register Dst = MI.getOperand(0).getReg();
2033   Register Src = MI.getOperand(1).getReg();
2034 
2035   const LLT S64 = LLT::scalar(64);
2036   const LLT S32 = LLT::scalar(32);
2037 
2038   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2039 
2040   unsigned Flags = MI.getFlags();
2041 
2042   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2043   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2044   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2045 
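  // K0 is 2**-32 and K1 is -(2**32): the high result word is
  // floor(Trunc * 2**-32) and the low word is the remainder
  // Trunc - 2**32 * floor(Trunc * 2**-32), computed via the fma below.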
2046   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2047   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2048   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2049 
2050   auto Hi = Signed ?
2051     B.buildFPTOSI(S32, FloorMul) :
2052     B.buildFPTOUI(S32, FloorMul);
2053   auto Lo = B.buildFPTOUI(S32, Fma);
2054 
2055   B.buildMerge(Dst, { Lo, Hi });
2056   MI.eraseFromParent();
2057 
2058   return true;
2059 }
2060 
2061 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2062                                                MachineInstr &MI) const {
2063   MachineFunction &MF = Helper.MIRBuilder.getMF();
2064   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2065 
2066   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2067                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2068 
2069   // With ieee_mode disabled, the instructions have the correct behavior
2070   // already for G_FMINNUM/G_FMAXNUM
2071   if (!MFI->getMode().IEEE)
2072     return !IsIEEEOp;
2073 
2074   if (IsIEEEOp)
2075     return true;
2076 
2077   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2081   MachineInstr &MI, MachineRegisterInfo &MRI,
2082   MachineIRBuilder &B) const {
2083   // TODO: Should move some of this into LegalizerHelper.
2084 
2085   // TODO: Promote dynamic indexing of s16 to s32
2086 
2087   // FIXME: Artifact combiner probably should have replaced the truncated
2088   // constant before this, so we shouldn't need
2089   // getConstantVRegValWithLookThrough.
2090   Optional<ValueAndVReg> MaybeIdxVal =
2091       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2092   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2093     return true;
2094   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2095 
2096   Register Dst = MI.getOperand(0).getReg();
2097   Register Vec = MI.getOperand(1).getReg();
2098 
2099   LLT VecTy = MRI.getType(Vec);
2100   LLT EltTy = VecTy.getElementType();
2101   assert(EltTy == MRI.getType(Dst));
2102 
2103   if (IdxVal < VecTy.getNumElements())
2104     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2105   else
2106     B.buildUndef(Dst);
2107 
2108   MI.eraseFromParent();
2109   return true;
2110 }
2111 
2112 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2113   MachineInstr &MI, MachineRegisterInfo &MRI,
2114   MachineIRBuilder &B) const {
2115   // TODO: Should move some of this into LegalizerHelper.
2116 
2117   // TODO: Promote dynamic indexing of s16 to s32
2118 
2119   // FIXME: Artifact combiner probably should have replaced the truncated
2120   // constant before this, so we shouldn't need
2121   // getConstantVRegValWithLookThrough.
2122   Optional<ValueAndVReg> MaybeIdxVal =
2123       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2124   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2125     return true;
2126 
2127   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2128   Register Dst = MI.getOperand(0).getReg();
2129   Register Vec = MI.getOperand(1).getReg();
2130   Register Ins = MI.getOperand(2).getReg();
2131 
2132   LLT VecTy = MRI.getType(Vec);
2133   LLT EltTy = VecTy.getElementType();
2134   assert(EltTy == MRI.getType(Ins));
2135 
2136   if (IdxVal < VecTy.getNumElements())
2137     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2138   else
2139     B.buildUndef(Dst);
2140 
2141   MI.eraseFromParent();
2142   return true;
2143 }
2144 
2145 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2146   MachineInstr &MI, MachineRegisterInfo &MRI,
2147   MachineIRBuilder &B) const {
2148   const LLT V2S16 = LLT::vector(2, 16);
2149 
2150   Register Dst = MI.getOperand(0).getReg();
2151   Register Src0 = MI.getOperand(1).getReg();
2152   LLT DstTy = MRI.getType(Dst);
2153   LLT SrcTy = MRI.getType(Src0);
2154 
2155   if (SrcTy == V2S16 && DstTy == V2S16 &&
2156       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2157     return true;
2158 
2159   MachineIRBuilder HelperBuilder(MI);
2160   GISelObserverWrapper DummyObserver;
2161   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2162   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2163 }
2164 
2165 bool AMDGPULegalizerInfo::legalizeSinCos(
2166   MachineInstr &MI, MachineRegisterInfo &MRI,
2167   MachineIRBuilder &B) const {
2168 
2169   Register DstReg = MI.getOperand(0).getReg();
2170   Register SrcReg = MI.getOperand(1).getReg();
2171   LLT Ty = MRI.getType(DstReg);
2172   unsigned Flags = MI.getFlags();
2173 
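  // The hardware sin/cos expect their input pre-divided by 2*pi (i.e. in
  // fractions of a full period), so scale the source by 0.5 * (1/pi) first.
  // Subtargets with a reduced valid input range also need the fractional
  // part of the scaled value.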
2174   Register TrigVal;
2175   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2176   if (ST.hasTrigReducedRange()) {
2177     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2178     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2179       .addUse(MulVal.getReg(0))
2180       .setMIFlags(Flags).getReg(0);
2181   } else
2182     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2183 
2184   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2185     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2186   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2187     .addUse(TrigVal)
2188     .setMIFlags(Flags);
2189   MI.eraseFromParent();
2190   return true;
2191 }
2192 
2193 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2194                                                   MachineIRBuilder &B,
2195                                                   const GlobalValue *GV,
2196                                                   int64_t Offset,
2197                                                   unsigned GAFlags) const {
2198   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2199   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2200   // to the following code sequence:
2201   //
2202   // For constant address space:
2203   //   s_getpc_b64 s[0:1]
2204   //   s_add_u32 s0, s0, $symbol
2205   //   s_addc_u32 s1, s1, 0
2206   //
2207   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2208   //   a fixup or relocation is emitted to replace $symbol with a literal
2209   //   constant, which is a pc-relative offset from the encoding of the $symbol
2210   //   operand to the global variable.
2211   //
2212   // For global address space:
2213   //   s_getpc_b64 s[0:1]
2214   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2215   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2216   //
2217   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2218   //   fixups or relocations are emitted to replace $symbol@*@lo and
2219   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2220   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2221   //   operand to the global variable.
2222   //
2223   // What we want here is an offset from the value returned by s_getpc
2224   // (which is the address of the s_add_u32 instruction) to the global
2225   // variable, but since the encoding of $symbol starts 4 bytes after the start
2226   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2227   // small. This requires us to add 4 to the global variable offset in order to
2228   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2229   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2230   // instruction.
2231 
2232   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2233 
2234   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2235     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2236 
2237   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2238     .addDef(PCReg);
2239 
2240   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2241   if (GAFlags == SIInstrInfo::MO_NONE)
2242     MIB.addImm(0);
2243   else
2244     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2245 
2246   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2247 
2248   if (PtrTy.getSizeInBits() == 32)
2249     B.buildExtract(DstReg, PCReg, 0);
2250   return true;
}
2252 
2253 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2254   MachineInstr &MI, MachineRegisterInfo &MRI,
2255   MachineIRBuilder &B) const {
2256   Register DstReg = MI.getOperand(0).getReg();
2257   LLT Ty = MRI.getType(DstReg);
2258   unsigned AS = Ty.getAddressSpace();
2259 
2260   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2261   MachineFunction &MF = B.getMF();
2262   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2263 
2264   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2265     if (!MFI->isModuleEntryFunction()) {
2266       const Function &Fn = MF.getFunction();
2267       DiagnosticInfoUnsupported BadLDSDecl(
2268         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2269         DS_Warning);
2270       Fn.getContext().diagnose(BadLDSDecl);
2271 
2272       // We currently don't have a way to correctly allocate LDS objects that
2273       // aren't directly associated with a kernel. We do force inlining of
2274       // functions that use local objects. However, if these dead functions are
2275       // not eliminated, we don't want a compile time error. Just emit a warning
2276       // and a trap, since there should be no callable path here.
2277       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2278       B.buildUndef(DstReg);
2279       MI.eraseFromParent();
2280       return true;
2281     }
2282 
2283     // TODO: We could emit code to handle the initialization somewhere.
2284     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2285       const SITargetLowering *TLI = ST.getTargetLowering();
2286       if (!TLI->shouldUseLDSConstAddress(GV)) {
2287         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2288         return true; // Leave in place;
2289       }
2290 
2291       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2292         Type *Ty = GV->getValueType();
2293         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2294         // zero-sized type in other languages to declare the dynamic shared
        // memory whose size is not known at compile time. They will be
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
2298         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2299           // Adjust alignment for that dynamic shared memory array.
2300           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2301           LLT S32 = LLT::scalar(32);
2302           auto Sz =
2303               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2304           B.buildIntToPtr(DstReg, Sz);
2305           MI.eraseFromParent();
2306           return true;
2307         }
2308       }
2309 
2310       B.buildConstant(
2311           DstReg,
2312           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2313       MI.eraseFromParent();
2314       return true;
2315     }
2316 
2317     const Function &Fn = MF.getFunction();
2318     DiagnosticInfoUnsupported BadInit(
2319       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2320     Fn.getContext().diagnose(BadInit);
2321     return true;
2322   }
2323 
2324   const SITargetLowering *TLI = ST.getTargetLowering();
2325 
2326   if (TLI->shouldEmitFixup(GV)) {
2327     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2328     MI.eraseFromParent();
2329     return true;
2330   }
2331 
2332   if (TLI->shouldEmitPCReloc(GV)) {
2333     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2334     MI.eraseFromParent();
2335     return true;
2336   }
2337 
2338   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2339   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2340 
2341   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2342       MachinePointerInfo::getGOT(MF),
2343       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2344           MachineMemOperand::MOInvariant,
2345       8 /*Size*/, Align(8));
2346 
2347   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2348 
2349   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2351     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2352     B.buildExtract(DstReg, Load, 0);
2353   } else
2354     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2355 
2356   MI.eraseFromParent();
2357   return true;
2358 }
2359 
2360 static LLT widenToNextPowerOf2(LLT Ty) {
2361   if (Ty.isVector())
2362     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2363   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2364 }
2365 
2366 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2367                                        MachineInstr &MI) const {
2368   MachineIRBuilder &B = Helper.MIRBuilder;
2369   MachineRegisterInfo &MRI = *B.getMRI();
2370   GISelChangeObserver &Observer = Helper.Observer;
2371 
2372   Register PtrReg = MI.getOperand(1).getReg();
2373   LLT PtrTy = MRI.getType(PtrReg);
2374   unsigned AddrSpace = PtrTy.getAddressSpace();
2375 
2376   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2377     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2378     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2379     Observer.changingInstr(MI);
2380     MI.getOperand(1).setReg(Cast.getReg(0));
2381     Observer.changedInstr(MI);
2382     return true;
2383   }
2384 
2385   Register ValReg = MI.getOperand(0).getReg();
2386   LLT ValTy = MRI.getType(ValReg);
2387 
2388   MachineMemOperand *MMO = *MI.memoperands_begin();
2389   const unsigned ValSize = ValTy.getSizeInBits();
2390   const unsigned MemSize = 8 * MMO->getSize();
2391   const Align MemAlign = MMO->getAlign();
2392   const unsigned AlignInBits = 8 * MemAlign.value();
2393 
2394   // Widen non-power-of-2 loads to the alignment if needed
2395   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2396     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2397 
2398     // This was already the correct extending load result type, so just adjust
2399     // the memory type.
2400     if (WideMemSize == ValSize) {
2401       MachineFunction &MF = B.getMF();
2402 
2403       MachineMemOperand *WideMMO =
2404           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2405       Observer.changingInstr(MI);
2406       MI.setMemRefs(MF, {WideMMO});
2407       Observer.changedInstr(MI);
2408       return true;
2409     }
2410 
2411     // Don't bother handling edge case that should probably never be produced.
2412     if (ValSize > WideMemSize)
2413       return false;
2414 
2415     LLT WideTy = widenToNextPowerOf2(ValTy);
2416 
2417     Register WideLoad;
2418     if (!WideTy.isVector()) {
2419       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad);
2421     } else {
2422       // Extract the subvector.
2423 
2424       if (isRegisterType(ValTy)) {
2425         // If this a case where G_EXTRACT is legal, use it.
2426         // (e.g. <3 x s32> -> <4 x s32>)
2427         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2428         B.buildExtract(ValReg, WideLoad, 0);
2429       } else {
2430         // For cases where the widened type isn't a nice register value, unmerge
2431         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2432         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2433         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2434         B.setInsertPt(B.getMBB(), MI.getIterator());
2435         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2436       }
2437     }
2438 
2439     MI.eraseFromParent();
2440     return true;
2441   }
2442 
2443   return false;
2444 }
2445 
2446 bool AMDGPULegalizerInfo::legalizeFMad(
2447   MachineInstr &MI, MachineRegisterInfo &MRI,
2448   MachineIRBuilder &B) const {
2449   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2450   assert(Ty.isScalar());
2451 
2452   MachineFunction &MF = B.getMF();
2453   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2454 
2455   // TODO: Always legal with future ftz flag.
2456   // FIXME: Do we need just output?
2457   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2458     return true;
2459   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2460     return true;
2461 
2462   MachineIRBuilder HelperBuilder(MI);
2463   GISelObserverWrapper DummyObserver;
2464   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2465   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2466 }
2467 
2468 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2469   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2470   Register DstReg = MI.getOperand(0).getReg();
2471   Register PtrReg = MI.getOperand(1).getReg();
2472   Register CmpVal = MI.getOperand(2).getReg();
2473   Register NewVal = MI.getOperand(3).getReg();
2474 
2475   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2476          "this should not have been custom lowered");
2477 
2478   LLT ValTy = MRI.getType(CmpVal);
2479   LLT VecTy = LLT::vector(2, ValTy);
2480 
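  // The target cmpxchg pseudo expects the new value and the compare value
  // packed together into a single vector operand.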
2481   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2482 
2483   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2484     .addDef(DstReg)
2485     .addUse(PtrReg)
2486     .addUse(PackedVal)
2487     .setMemRefs(MI.memoperands());
2488 
2489   MI.eraseFromParent();
2490   return true;
2491 }
2492 
2493 bool AMDGPULegalizerInfo::legalizeFlog(
2494   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2495   Register Dst = MI.getOperand(0).getReg();
2496   Register Src = MI.getOperand(1).getReg();
2497   LLT Ty = B.getMRI()->getType(Dst);
2498   unsigned Flags = MI.getFlags();
2499 
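  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes the precomputed
  // 1 / log2(b) factor as Log2BaseInverted.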
2500   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2501   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2502 
2503   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2504   MI.eraseFromParent();
2505   return true;
2506 }
2507 
2508 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2509                                        MachineIRBuilder &B) const {
2510   Register Dst = MI.getOperand(0).getReg();
2511   Register Src = MI.getOperand(1).getReg();
2512   unsigned Flags = MI.getFlags();
2513   LLT Ty = B.getMRI()->getType(Dst);
2514 
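  // exp(x) = exp2(x * log2(e)).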
2515   auto K = B.buildFConstant(Ty, numbers::log2e);
2516   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2517   B.buildFExp2(Dst, Mul, Flags);
2518   MI.eraseFromParent();
2519   return true;
2520 }
2521 
2522 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2523                                        MachineIRBuilder &B) const {
2524   Register Dst = MI.getOperand(0).getReg();
2525   Register Src0 = MI.getOperand(1).getReg();
2526   Register Src1 = MI.getOperand(2).getReg();
2527   unsigned Flags = MI.getFlags();
2528   LLT Ty = B.getMRI()->getType(Dst);
2529   const LLT S16 = LLT::scalar(16);
2530   const LLT S32 = LLT::scalar(32);
2531 
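  // pow(x, y) is expanded as exp2(y * log2(x)); the legacy multiply (which
  // treats 0 * anything, including inf/nan, as 0) is used for the
  // intermediate product.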
2532   if (Ty == S32) {
2533     auto Log = B.buildFLog2(S32, Src0, Flags);
2534     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2535       .addUse(Log.getReg(0))
2536       .addUse(Src1)
2537       .setMIFlags(Flags);
2538     B.buildFExp2(Dst, Mul, Flags);
2539   } else if (Ty == S16) {
2540     // There's no f16 fmul_legacy, so we need to convert for it.
2541     auto Log = B.buildFLog2(S16, Src0, Flags);
2542     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2543     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2544     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2545       .addUse(Ext0.getReg(0))
2546       .addUse(Ext1.getReg(0))
2547       .setMIFlags(Flags);
2548 
2549     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2550   } else
2551     return false;
2552 
2553   MI.eraseFromParent();
2554   return true;
2555 }
2556 
2557 // Find a source register, ignoring any possible source modifiers.
2558 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2559   Register ModSrc = OrigSrc;
2560   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2561     ModSrc = SrcFNeg->getOperand(1).getReg();
2562     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2563       ModSrc = SrcFAbs->getOperand(1).getReg();
2564   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2565     ModSrc = SrcFAbs->getOperand(1).getReg();
2566   return ModSrc;
2567 }
2568 
2569 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2570                                          MachineRegisterInfo &MRI,
2571                                          MachineIRBuilder &B) const {
2572 
2573   const LLT S1 = LLT::scalar(1);
2574   const LLT S64 = LLT::scalar(64);
2575   Register Dst = MI.getOperand(0).getReg();
2576   Register OrigSrc = MI.getOperand(1).getReg();
2577   unsigned Flags = MI.getFlags();
2578   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2579          "this should not have been custom lowered");
2580 
2581   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2582   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2583   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2584   // V_FRACT bug is:
2585   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2586   //
2587   // Convert floor(x) to (x - fract(x))
2588 
2589   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2590     .addUse(OrigSrc)
2591     .setMIFlags(Flags);
2592 
2593   // Give source modifier matching some assistance before obscuring a foldable
2594   // pattern.
2595 
2596   // TODO: We can avoid the neg on the fract? The input sign to fract
2597   // shouldn't matter?
2598   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2599 
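  // 0x3fefffffffffffff is the largest double strictly less than 1.0,
  // i.e. the 0.99999999999999999 clamp from the workaround described above.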
2600   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2601 
2602   Register Min = MRI.createGenericVirtualRegister(S64);
2603 
2604   // We don't need to concern ourselves with the snan handling difference, so
2605   // use the one which will directly select.
2606   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2607   if (MFI->getMode().IEEE)
2608     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2609   else
2610     B.buildFMinNum(Min, Fract, Const, Flags);
2611 
2612   Register CorrectedFract = Min;
2613   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2614     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2615     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2616   }
2617 
2618   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2619   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2620 
2621   MI.eraseFromParent();
2622   return true;
2623 }
2624 
2625 // Turn an illegal packed v2s16 build vector into bit operations.
2626 // TODO: This should probably be a bitcast action in LegalizerHelper.
2627 bool AMDGPULegalizerInfo::legalizeBuildVector(
2628   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2629   Register Dst = MI.getOperand(0).getReg();
2630   const LLT S32 = LLT::scalar(32);
2631   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2632 
2633   Register Src0 = MI.getOperand(1).getReg();
2634   Register Src1 = MI.getOperand(2).getReg();
2635   assert(MRI.getType(Src0) == LLT::scalar(16));
2636 
2637   auto Merge = B.buildMerge(S32, {Src0, Src1});
2638   B.buildBitcast(Dst, Merge);
2639 
2640   MI.eraseFromParent();
2641   return true;
2642 }
2643 
2644 // Check that this is a G_XOR x, -1
2645 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2646   if (MI.getOpcode() != TargetOpcode::G_XOR)
2647     return false;
2648   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2649   return ConstVal && *ConstVal == -1;
2650 }
2651 
2652 // Return the use branch instruction, otherwise null if the usage is invalid.
2653 static MachineInstr *
2654 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2655                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2656   Register CondDef = MI.getOperand(0).getReg();
2657   if (!MRI.hasOneNonDBGUse(CondDef))
2658     return nullptr;
2659 
2660   MachineBasicBlock *Parent = MI.getParent();
2661   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2662 
2663   if (isNot(MRI, *UseMI)) {
2664     Register NegatedCond = UseMI->getOperand(0).getReg();
2665     if (!MRI.hasOneNonDBGUse(NegatedCond))
2666       return nullptr;
2667 
2668     // We're deleting the def of this value, so we need to remove it.
2669     UseMI->eraseFromParent();
2670 
2671     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2672     Negated = true;
2673   }
2674 
2675   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2676     return nullptr;
2677 
2678   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2679   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2680   if (Next == Parent->end()) {
2681     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2682     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2683       return nullptr;
2684     UncondBrTarget = &*NextMBB;
2685   } else {
2686     if (Next->getOpcode() != AMDGPU::G_BR)
2687       return nullptr;
2688     Br = &*Next;
2689     UncondBrTarget = Br->getOperand(0).getMBB();
2690   }
2691 
2692   return UseMI;
2693 }
2694 
2695 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2696                                          const ArgDescriptor *Arg,
2697                                          const TargetRegisterClass *ArgRC,
2698                                          LLT ArgTy) const {
2699   MCRegister SrcReg = Arg->getRegister();
2700   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2701   assert(DstReg.isVirtual() && "Virtual register expected");
2702 
2703   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2704                                              ArgTy);
2705   if (Arg->isMasked()) {
2706     // TODO: Should we try to emit this once in the entry block?
2707     const LLT S32 = LLT::scalar(32);
2708     const unsigned Mask = Arg->getMask();
2709     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2710 
2711     Register AndMaskSrc = LiveIn;
2712 
2713     if (Shift != 0) {
2714       auto ShiftAmt = B.buildConstant(S32, Shift);
2715       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2716     }
2717 
2718     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2719   } else {
2720     B.buildCopy(DstReg, LiveIn);
2721   }
2722 
2723   return true;
2724 }
2725 
2726 bool AMDGPULegalizerInfo::loadInputValue(
2727     Register DstReg, MachineIRBuilder &B,
2728     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2729   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2730   const ArgDescriptor *Arg;
2731   const TargetRegisterClass *ArgRC;
2732   LLT ArgTy;
2733   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2734 
2735   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2736     return false; // TODO: Handle these
2737   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2738 }
2739 
2740 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2741     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2742     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2743   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2744     return false;
2745 
2746   MI.eraseFromParent();
2747   return true;
2748 }
2749 
2750 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2751                                        MachineRegisterInfo &MRI,
2752                                        MachineIRBuilder &B) const {
2753   Register Dst = MI.getOperand(0).getReg();
2754   LLT DstTy = MRI.getType(Dst);
2755   LLT S16 = LLT::scalar(16);
2756   LLT S32 = LLT::scalar(32);
2757   LLT S64 = LLT::scalar(64);
2758 
2759   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2760     return true;
2761 
2762   if (DstTy == S16)
2763     return legalizeFDIV16(MI, MRI, B);
2764   if (DstTy == S32)
2765     return legalizeFDIV32(MI, MRI, B);
2766   if (DstTy == S64)
2767     return legalizeFDIV64(MI, MRI, B);
2768 
2769   return false;
2770 }
2771 
2772 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2773                                                   Register DstReg,
2774                                                   Register X,
2775                                                   Register Y,
2776                                                   bool IsDiv) const {
2777   const LLT S1 = LLT::scalar(1);
2778   const LLT S32 = LLT::scalar(32);
2779 
2780   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2781   // algorithm used here.
2782 
2783   // Initial estimate of inv(y).
2784   auto FloatY = B.buildUITOFP(S32, Y);
2785   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2786   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2787   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2788   auto Z = B.buildFPTOUI(S32, ScaledY);
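  // Z now approximates 2**32 / Y; the scale constant is just below 2**32.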
2789 
  // One round of UNR (unsigned Newton-Raphson) refinement of the estimate.
2791   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2792   auto NegYZ = B.buildMul(S32, NegY, Z);
2793   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2794 
2795   // Quotient/remainder estimate.
2796   auto Q = B.buildUMulH(S32, X, Z);
2797   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2798 
2799   // First quotient/remainder refinement.
2800   auto One = B.buildConstant(S32, 1);
2801   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2802   if (IsDiv)
2803     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2804   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2805 
2806   // Second quotient/remainder refinement.
2807   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2808   if (IsDiv)
2809     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2810   else
2811     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2812 }
2813 
2814 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2815                                               MachineRegisterInfo &MRI,
2816                                               MachineIRBuilder &B) const {
2817   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2818   Register DstReg = MI.getOperand(0).getReg();
2819   Register Num = MI.getOperand(1).getReg();
2820   Register Den = MI.getOperand(2).getReg();
2821   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2822   MI.eraseFromParent();
2823   return true;
2824 }
2825 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2827 //
2828 // Return lo, hi of result
2829 //
2830 // %cvt.lo = G_UITOFP Val.lo
2831 // %cvt.hi = G_UITOFP Val.hi
2832 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2833 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2834 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2835 // %mul2 = G_FMUL %mul1, 2**(-32)
2836 // %trunc = G_INTRINSIC_TRUNC %mul2
2837 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2838 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2839 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2840                                                        Register Val) {
2841   const LLT S32 = LLT::scalar(32);
2842   auto Unmerge = B.buildUnmerge(S32, Val);
2843 
2844   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2845   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2846 
2847   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2848                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2849 
2850   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2851   auto Mul1 =
2852       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2853 
2854   // 2**(-32)
2855   auto Mul2 =
2856       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2857   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2858 
2859   // -(2**32)
2860   auto Mad2 = B.buildFMAD(S32, Trunc,
2861                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2862 
2863   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2864   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2865 
2866   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2867 }
2868 
2869 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2870                                                   Register DstReg,
2871                                                   Register Numer,
2872                                                   Register Denom,
2873                                                   bool IsDiv) const {
2874   const LLT S32 = LLT::scalar(32);
2875   const LLT S64 = LLT::scalar(64);
2876   const LLT S1 = LLT::scalar(1);
2877   Register RcpLo, RcpHi;
2878 
2879   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
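  // {RcpLo, RcpHi} is an estimate of 2**64 / Denom. The code below performs
  // two Newton-Raphson refinement steps on it, carried out on 32-bit halves
  // with explicit carry propagation (uaddo/uadde), before forming the quotient
  // estimate MulHi3 = umulh(Numer, <refined reciprocal>).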
2880 
2881   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2882 
2883   auto Zero64 = B.buildConstant(S64, 0);
2884   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2885 
2886   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2887   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2888 
2889   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2890   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2891   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2892 
2893   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2894   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2895   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2896   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2897 
2898   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2899   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2900   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2901   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2902   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2903 
2904   auto Zero32 = B.buildConstant(S32, 0);
2905   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2906   auto Add2_HiC =
2907       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2908   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2909   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2910 
2911   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2912   Register NumerLo = UnmergeNumer.getReg(0);
2913   Register NumerHi = UnmergeNumer.getReg(1);
2914 
2915   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2916   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2917   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2918   Register Mul3_Lo = UnmergeMul3.getReg(0);
2919   Register Mul3_Hi = UnmergeMul3.getReg(1);
2920   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2921   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2922   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2923   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2924 
2925   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2926   Register DenomLo = UnmergeDenom.getReg(0);
2927   Register DenomHi = UnmergeDenom.getReg(1);
2928 
2929   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2930   auto C1 = B.buildSExt(S32, CmpHi);
2931 
2932   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2933   auto C2 = B.buildSExt(S32, CmpLo);
2934 
2935   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2936   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
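  // C3 is non-zero when the remainder estimate Sub1 is still >= Denom: compare
  // the high halves, falling back to the low-half comparison when the high
  // halves are equal.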
2937 
  // TODO: Here and below, portions of the code can be enclosed into if/endif.
  // Currently the control flow is unconditional and we have 4 selects after
  // the potential endif to substitute PHIs.
2941 
2942   // if C3 != 0 ...
2943   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2944   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2945   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2946   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2947 
2948   auto One64 = B.buildConstant(S64, 1);
2949   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2950 
2951   auto C4 =
2952       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2953   auto C5 =
2954       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2955   auto C6 = B.buildSelect(
2956       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2957 
2958   // if (C6 != 0)
2959   auto Add4 = B.buildAdd(S64, Add3, One64);
2960   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2961 
2962   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2963   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2964   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2965 
2966   // endif C6
2967   // endif C3
2968 
2969   if (IsDiv) {
2970     auto Sel1 = B.buildSelect(
2971         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2972     B.buildSelect(DstReg,
2973                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2974   } else {
2975     auto Sel2 = B.buildSelect(
2976         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2977     B.buildSelect(DstReg,
2978                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2979   }
2980 }
2981 
2982 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2983                                             MachineRegisterInfo &MRI,
2984                                             MachineIRBuilder &B) const {
2985   const LLT S64 = LLT::scalar(64);
2986   const LLT S32 = LLT::scalar(32);
2987   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2988   Register DstReg = MI.getOperand(0).getReg();
2989   Register Num = MI.getOperand(1).getReg();
2990   Register Den = MI.getOperand(2).getReg();
2991   LLT Ty = MRI.getType(DstReg);
2992 
2993   if (Ty == S32)
2994     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2995   else if (Ty == S64)
2996     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2997   else
2998     return false;
2999 
3000   MI.eraseFromParent();
3001   return true;
}
3004 
3005 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3006                                             MachineRegisterInfo &MRI,
3007                                             MachineIRBuilder &B) const {
3008   const LLT S64 = LLT::scalar(64);
3009   const LLT S32 = LLT::scalar(32);
3010 
3011   Register DstReg = MI.getOperand(0).getReg();
3012   const LLT Ty = MRI.getType(DstReg);
3013   if (Ty != S32 && Ty != S64)
3014     return false;
3015 
3016   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3017 
3018   Register LHS = MI.getOperand(1).getReg();
3019   Register RHS = MI.getOperand(2).getReg();
3020 
3021   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3022   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3023   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3024 
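  // Take absolute values via the usual branchless identity |v| = (v + s) ^ s,
  // where s = v >> (bitwidth - 1) is 0 or -1 depending on the sign.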
3025   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3026   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3027 
3028   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3029   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3030 
3031   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3032   if (Ty == S32)
3033     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3034   else
3035     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3036 
3037   Register Sign;
3038   if (IsDiv)
3039     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3040   else
3041     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3042 
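  // Conditionally negate the unsigned result: (r ^ s) - s restores the sign of
  // the signed quotient/remainder.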
3043   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3044   B.buildSub(DstReg, UDivRem, Sign);
3045 
3046   MI.eraseFromParent();
3047   return true;
3048 }
3049 
3050 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3051                                                  MachineRegisterInfo &MRI,
3052                                                  MachineIRBuilder &B) const {
3053   Register Res = MI.getOperand(0).getReg();
3054   Register LHS = MI.getOperand(1).getReg();
3055   Register RHS = MI.getOperand(2).getReg();
3056   uint16_t Flags = MI.getFlags();
3057   LLT ResTy = MRI.getType(Res);
3058 
3059   const MachineFunction &MF = B.getMF();
3060   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3061                             MI.getFlag(MachineInstr::FmAfn);
3062 
3063   if (!AllowInaccurateRcp)
3064     return false;
3065 
3066   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3067     // 1 / x -> RCP(x)
3068     if (CLHS->isExactlyValue(1.0)) {
3069       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3070         .addUse(RHS)
3071         .setMIFlags(Flags);
3072 
3073       MI.eraseFromParent();
3074       return true;
3075     }
3076 
3077     // -1 / x -> RCP( FNEG(x) )
3078     if (CLHS->isExactlyValue(-1.0)) {
3079       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3080       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3081         .addUse(FNeg.getReg(0))
3082         .setMIFlags(Flags);
3083 
3084       MI.eraseFromParent();
3085       return true;
3086     }
3087   }
3088 
3089   // x / y -> x * (1.0 / y)
3090   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3091     .addUse(RHS)
3092     .setMIFlags(Flags);
3093   B.buildFMul(Res, LHS, RCP, Flags);
3094 
3095   MI.eraseFromParent();
3096   return true;
3097 }
3098 
3099 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3100                                          MachineRegisterInfo &MRI,
3101                                          MachineIRBuilder &B) const {
3102   Register Res = MI.getOperand(0).getReg();
3103   Register LHS = MI.getOperand(1).getReg();
3104   Register RHS = MI.getOperand(2).getReg();
3105 
3106   uint16_t Flags = MI.getFlags();
3107 
3108   LLT S16 = LLT::scalar(16);
3109   LLT S32 = LLT::scalar(32);
3110 
3111   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3112   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3113 
3114   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3115     .addUse(RHSExt.getReg(0))
3116     .setMIFlags(Flags);
3117 
3118   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3119   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3120 
3121   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3122     .addUse(RDst.getReg(0))
3123     .addUse(RHS)
3124     .addUse(LHS)
3125     .setMIFlags(Flags);
3126 
3127   MI.eraseFromParent();
3128   return true;
3129 }
3130 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; otherwise restore the function's default FP32 denorm
// mode.
3133 static void toggleSPDenormMode(bool Enable,
3134                                MachineIRBuilder &B,
3135                                const GCNSubtarget &ST,
3136                                AMDGPU::SIModeRegisterDefaults Mode) {
3137   // Set SP denorm mode to this value.
3138   unsigned SPDenormMode =
3139     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3140 
3141   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3143     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3144 
3145     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3146     B.buildInstr(AMDGPU::S_DENORM_MODE)
3147       .addImm(NewDenormModeValue);
3148 
3149   } else {
3150     // Select FP32 bit field in mode register.
3151     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3152                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3153                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
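    // This selects bits [5:4] of the MODE register (offset 4, width 2), which
    // hold the FP32 denorm controls.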
3154 
3155     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3156       .addImm(SPDenormMode)
3157       .addImm(SPDenormModeBitField);
3158   }
3159 }
3160 
3161 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3162                                          MachineRegisterInfo &MRI,
3163                                          MachineIRBuilder &B) const {
3164   Register Res = MI.getOperand(0).getReg();
3165   Register LHS = MI.getOperand(1).getReg();
3166   Register RHS = MI.getOperand(2).getReg();
3167   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3168   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3169 
3170   uint16_t Flags = MI.getFlags();
3171 
3172   LLT S32 = LLT::scalar(32);
3173   LLT S1 = LLT::scalar(1);
3174 
3175   auto One = B.buildFConstant(S32, 1.0f);
3176 
3177   auto DenominatorScaled =
3178     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3179       .addUse(LHS)
3180       .addUse(RHS)
3181       .addImm(0)
3182       .setMIFlags(Flags);
3183   auto NumeratorScaled =
3184     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3185       .addUse(LHS)
3186       .addUse(RHS)
3187       .addImm(1)
3188       .setMIFlags(Flags);
3189 
3190   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3191     .addUse(DenominatorScaled.getReg(0))
3192     .setMIFlags(Flags);
3193   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3194 
3195   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3196   // aren't modeled as reading it.
3197   if (!Mode.allFP32Denormals())
3198     toggleSPDenormMode(true, B, ST, Mode);
3199 
3200   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3201   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3202   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3203   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3204   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3205   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
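  // Fma0/Fma1 perform a Newton-Raphson refinement of the reciprocal estimate,
  // and Fma2..Fma4 refine the scaled quotient; div_fmas and div_fixup below
  // then apply the final correction and handle the special cases.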
3206 
3207   if (!Mode.allFP32Denormals())
3208     toggleSPDenormMode(false, B, ST, Mode);
3209 
3210   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3211     .addUse(Fma4.getReg(0))
3212     .addUse(Fma1.getReg(0))
3213     .addUse(Fma3.getReg(0))
3214     .addUse(NumeratorScaled.getReg(1))
3215     .setMIFlags(Flags);
3216 
3217   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3218     .addUse(Fmas.getReg(0))
3219     .addUse(RHS)
3220     .addUse(LHS)
3221     .setMIFlags(Flags);
3222 
3223   MI.eraseFromParent();
3224   return true;
3225 }
3226 
3227 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3228                                          MachineRegisterInfo &MRI,
3229                                          MachineIRBuilder &B) const {
3230   Register Res = MI.getOperand(0).getReg();
3231   Register LHS = MI.getOperand(1).getReg();
3232   Register RHS = MI.getOperand(2).getReg();
3233 
3234   uint16_t Flags = MI.getFlags();
3235 
3236   LLT S64 = LLT::scalar(64);
3237   LLT S1 = LLT::scalar(1);
3238 
3239   auto One = B.buildFConstant(S64, 1.0);
3240 
3241   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3242     .addUse(LHS)
3243     .addUse(RHS)
3244     .addImm(0)
3245     .setMIFlags(Flags);
3246 
3247   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3248 
3249   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3250     .addUse(DivScale0.getReg(0))
3251     .setMIFlags(Flags);
3252 
3253   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3254   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3255   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3256 
3257   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3258     .addUse(LHS)
3259     .addUse(RHS)
3260     .addImm(1)
3261     .setMIFlags(Flags);
3262 
3263   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3264   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3265   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
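  // Fma0..Fma3 perform two Newton-Raphson refinements of the reciprocal, Mul
  // forms the scaled quotient estimate and Fma4 recomputes the remainder;
  // div_fmas/div_fixup below apply the final rounding and special-case
  // handling.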
3266 
3267   Register Scale;
3268   if (!ST.hasUsableDivScaleConditionOutput()) {
3269     // Workaround a hardware bug on SI where the condition output from div_scale
3270     // is not usable.
3271 
3272     LLT S32 = LLT::scalar(32);
3273 
3274     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3275     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3276     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3277     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3278 
3279     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3280                               Scale1Unmerge.getReg(1));
3281     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3282                               Scale0Unmerge.getReg(1));
3283     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3284   } else {
3285     Scale = DivScale1.getReg(1);
3286   }
3287 
3288   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3289     .addUse(Fma4.getReg(0))
3290     .addUse(Fma3.getReg(0))
3291     .addUse(Mul.getReg(0))
3292     .addUse(Scale)
3293     .setMIFlags(Flags);
3294 
3295   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3296     .addUse(Fmas.getReg(0))
3297     .addUse(RHS)
3298     .addUse(LHS)
3299     .setMIFlags(Flags);
3300 
3301   MI.eraseFromParent();
3302   return true;
3303 }
3304 
3305 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3306                                                  MachineRegisterInfo &MRI,
3307                                                  MachineIRBuilder &B) const {
3308   Register Res = MI.getOperand(0).getReg();
3309   Register LHS = MI.getOperand(2).getReg();
3310   Register RHS = MI.getOperand(3).getReg();
3311   uint16_t Flags = MI.getFlags();
3312 
3313   LLT S32 = LLT::scalar(32);
3314   LLT S1 = LLT::scalar(1);
3315 
3316   auto Abs = B.buildFAbs(S32, RHS, Flags);
3317   const APFloat C0Val(1.0f);
3318 
3319   auto C0 = B.buildConstant(S32, 0x6f800000);
3320   auto C1 = B.buildConstant(S32, 0x2f800000);
3321   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3322 
3323   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3324   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
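  // If |RHS| is very large (> ~2**96, i.e. 0x6f800000), pre-scale it by 2**-32
  // (0x2f800000) so that its reciprocal stays comfortably inside the normal
  // range; the final multiply by Sel cancels the scaling, since
  // Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS.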
3325 
3326   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3327 
3328   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3329     .addUse(Mul0.getReg(0))
3330     .setMIFlags(Flags);
3331 
3332   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3333 
3334   B.buildFMul(Res, Sel, Mul1, Flags);
3335 
3336   MI.eraseFromParent();
3337   return true;
3338 }
3339 
3340 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3341 // FIXME: Why do we handle this one but not other removed instructions?
3342 //
3343 // Reciprocal square root.  The clamp prevents infinite results, clamping
3344 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3345 // +-max_float.
3346 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3347                                                     MachineRegisterInfo &MRI,
3348                                                     MachineIRBuilder &B) const {
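  // Subtargets before VI still have the native v_rsq_clamp instructions, so
  // the intrinsic is already legal there.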
3349   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3350     return true;
3351 
3352   Register Dst = MI.getOperand(0).getReg();
3353   Register Src = MI.getOperand(2).getReg();
3354   auto Flags = MI.getFlags();
3355 
3356   LLT Ty = MRI.getType(Dst);
3357 
3358   const fltSemantics *FltSemantics;
3359   if (Ty == LLT::scalar(32))
3360     FltSemantics = &APFloat::IEEEsingle();
3361   else if (Ty == LLT::scalar(64))
3362     FltSemantics = &APFloat::IEEEdouble();
3363   else
3364     return false;
3365 
3366   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3367     .addUse(Src)
3368     .setMIFlags(Flags);
3369 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq has already quieted the result (or not); just use whichever min/max
  // variant will directly select for the current IEEE mode.
3372   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3373   const bool UseIEEE = MFI->getMode().IEEE;
3374 
3375   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3376   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3377                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3378 
3379   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3380 
3381   if (UseIEEE)
3382     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3383   else
3384     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3385   MI.eraseFromParent();
3386   return true;
3387 }
3388 
3389 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3390   switch (IID) {
3391   case Intrinsic::amdgcn_ds_fadd:
3392     return AMDGPU::G_ATOMICRMW_FADD;
3393   case Intrinsic::amdgcn_ds_fmin:
3394     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3395   case Intrinsic::amdgcn_ds_fmax:
3396     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3397   default:
3398     llvm_unreachable("not a DS FP intrinsic");
3399   }
3400 }
3401 
3402 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3403                                                       MachineInstr &MI,
3404                                                       Intrinsic::ID IID) const {
3405   GISelChangeObserver &Observer = Helper.Observer;
3406   Observer.changingInstr(MI);
3407 
3408   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3409 
3410   // The remaining operands were used to set fields in the MemOperand on
3411   // construction.
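  // (For llvm.amdgcn.ds.fadd/fmin/fmax these are the ordering, scope and
  // isVolatile operands; only the pointer and data operands are kept.)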
3412   for (int I = 6; I > 3; --I)
3413     MI.RemoveOperand(I);
3414 
3415   MI.RemoveOperand(1); // Remove the intrinsic ID.
3416   Observer.changedInstr(MI);
3417   return true;
3418 }
3419 
3420 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3421                                             MachineRegisterInfo &MRI,
3422                                             MachineIRBuilder &B) const {
3423   uint64_t Offset =
3424     ST.getTargetLowering()->getImplicitParameterOffset(
3425       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3426   LLT DstTy = MRI.getType(DstReg);
3427   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3428 
3429   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3430   if (!loadInputValue(KernargPtrReg, B,
3431                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3432     return false;
3433 
3434   // FIXME: This should be nuw
3435   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3436   return true;
3437 }
3438 
3439 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3440                                                  MachineRegisterInfo &MRI,
3441                                                  MachineIRBuilder &B) const {
3442   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3443   if (!MFI->isEntryFunction()) {
3444     return legalizePreloadedArgIntrin(MI, MRI, B,
3445                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3446   }
3447 
3448   Register DstReg = MI.getOperand(0).getReg();
3449   if (!getImplicitArgPtr(DstReg, MRI, B))
3450     return false;
3451 
3452   MI.eraseFromParent();
3453   return true;
3454 }
3455 
3456 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3457                                               MachineRegisterInfo &MRI,
3458                                               MachineIRBuilder &B,
3459                                               unsigned AddrSpace) const {
3460   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3461   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3462   Register Hi32 = Unmerge.getReg(1);
3463 
3464   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3465   MI.eraseFromParent();
3466   return true;
3467 }
3468 
3469 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3470 // offset (the offset that is included in bounds checking and swizzling, to be
3471 // split between the instruction's voffset and immoffset fields) and soffset
3472 // (the offset that is excluded from bounds checking and swizzling, to go in
3473 // the instruction's soffset field).  This function takes the first kind of
3474 // offset and figures out how to split it between voffset and immoffset.
3475 std::tuple<Register, unsigned, unsigned>
3476 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3477                                         Register OrigOffset) const {
3478   const unsigned MaxImm = 4095;
3479   Register BaseReg;
3480   unsigned TotalConstOffset;
3481   const LLT S32 = LLT::scalar(32);
3482 
3483   std::tie(BaseReg, TotalConstOffset) =
3484       AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3485 
3486   unsigned ImmOffset = TotalConstOffset;
3487 
  // If the immediate value is too big for the immoffset field, keep only the
  // low bits (value & 4095) in the immoffset field, so that the value that is
  // copied/added for the voffset field is a multiple of 4096 and stands more
  // chance of being CSEd with the copy/add for another similar load/store.
  // However, do not round down to a multiple of 4096 if that would produce a
  // negative number, as it appears to be illegal to have a negative offset in
  // the vgpr, even if adding the immediate offset makes it positive.
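  // For example (illustrative): a constant offset of 8200 is split so that
  // 8192 is added to the voffset register and the immoffset becomes 8.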
3495   unsigned Overflow = ImmOffset & ~MaxImm;
3496   ImmOffset -= Overflow;
3497   if ((int32_t)Overflow < 0) {
3498     Overflow += ImmOffset;
3499     ImmOffset = 0;
3500   }
3501 
3502   if (Overflow != 0) {
3503     if (!BaseReg) {
3504       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3505     } else {
3506       auto OverflowVal = B.buildConstant(S32, Overflow);
3507       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3508     }
3509   }
3510 
3511   if (!BaseReg)
3512     BaseReg = B.buildConstant(S32, 0).getReg(0);
3513 
3514   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3515 }
3516 
3517 /// Handle register layout difference for f16 images for some subtargets.
3518 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3519                                              MachineRegisterInfo &MRI,
3520                                              Register Reg,
3521                                              bool ImageStore) const {
3522   const LLT S16 = LLT::scalar(16);
3523   const LLT S32 = LLT::scalar(32);
3524   LLT StoreVT = MRI.getType(Reg);
3525   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3526 
3527   if (ST.hasUnpackedD16VMem()) {
3528     auto Unmerge = B.buildUnmerge(S16, Reg);
3529 
3530     SmallVector<Register, 4> WideRegs;
3531     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3532       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3533 
3534     int NumElts = StoreVT.getNumElements();
3535 
3536     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3537   }
3538 
3539   if (ImageStore && ST.hasImageStoreD16Bug()) {
3540     if (StoreVT.getNumElements() == 2) {
3541       SmallVector<Register, 4> PackedRegs;
3542       Reg = B.buildBitcast(S32, Reg).getReg(0);
3543       PackedRegs.push_back(Reg);
3544       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3545       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3546     }
3547 
3548     if (StoreVT.getNumElements() == 3) {
3549       SmallVector<Register, 4> PackedRegs;
3550       auto Unmerge = B.buildUnmerge(S16, Reg);
3551       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3552         PackedRegs.push_back(Unmerge.getReg(I));
3553       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3554       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3555       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3556     }
3557 
3558     if (StoreVT.getNumElements() == 4) {
3559       SmallVector<Register, 4> PackedRegs;
3560       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3561       auto Unmerge = B.buildUnmerge(S32, Reg);
3562       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3563         PackedRegs.push_back(Unmerge.getReg(I));
3564       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3565       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3566     }
3567 
3568     llvm_unreachable("invalid data type");
3569   }
3570 
3571   return Reg;
3572 }
3573 
3574 Register AMDGPULegalizerInfo::fixStoreSourceType(
3575   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3576   MachineRegisterInfo *MRI = B.getMRI();
3577   LLT Ty = MRI->getType(VData);
3578 
3579   const LLT S16 = LLT::scalar(16);
3580 
  // Fix up illegal register types for i8 and i16 stores by any-extending to
  // i32.
3582   if (Ty == LLT::scalar(8) || Ty == S16) {
3583     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3584     return AnyExt;
3585   }
3586 
3587   if (Ty.isVector()) {
3588     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3589       if (IsFormat)
3590         return handleD16VData(B, *MRI, VData);
3591     }
3592   }
3593 
3594   return VData;
3595 }
3596 
3597 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3598                                               MachineRegisterInfo &MRI,
3599                                               MachineIRBuilder &B,
3600                                               bool IsTyped,
3601                                               bool IsFormat) const {
3602   Register VData = MI.getOperand(1).getReg();
3603   LLT Ty = MRI.getType(VData);
3604   LLT EltTy = Ty.getScalarType();
3605   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3606   const LLT S32 = LLT::scalar(32);
3607 
3608   VData = fixStoreSourceType(B, VData, IsFormat);
3609   Register RSrc = MI.getOperand(2).getReg();
3610 
3611   MachineMemOperand *MMO = *MI.memoperands_begin();
3612   const int MemSize = MMO->getSize();
3613 
3614   unsigned ImmOffset;
3615   unsigned TotalOffset;
3616 
3617   // The typed intrinsics add an immediate after the registers.
3618   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3619 
3620   // The struct intrinsic variants add one additional operand over raw.
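  // For example (illustrative operand layout): raw.buffer.store takes
  // (vdata, rsrc, voffset, soffset, aux), the struct variant inserts vindex
  // after rsrc, and the typed variants add a format immediate before aux.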
3621   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3622   Register VIndex;
3623   int OpOffset = 0;
3624   if (HasVIndex) {
3625     VIndex = MI.getOperand(3).getReg();
3626     OpOffset = 1;
3627   }
3628 
3629   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3630   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3631 
3632   unsigned Format = 0;
3633   if (IsTyped) {
3634     Format = MI.getOperand(5 + OpOffset).getImm();
3635     ++OpOffset;
3636   }
3637 
3638   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3639 
3640   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3641   if (TotalOffset != 0)
3642     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3643 
3644   unsigned Opc;
3645   if (IsTyped) {
3646     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3647                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3648   } else if (IsFormat) {
3649     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3650                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3651   } else {
3652     switch (MemSize) {
3653     case 1:
3654       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3655       break;
3656     case 2:
3657       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3658       break;
3659     default:
3660       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3661       break;
3662     }
3663   }
3664 
3665   if (!VIndex)
3666     VIndex = B.buildConstant(S32, 0).getReg(0);
3667 
3668   auto MIB = B.buildInstr(Opc)
3669     .addUse(VData)              // vdata
3670     .addUse(RSrc)               // rsrc
3671     .addUse(VIndex)             // vindex
3672     .addUse(VOffset)            // voffset
3673     .addUse(SOffset)            // soffset
3674     .addImm(ImmOffset);         // offset(imm)
3675 
3676   if (IsTyped)
3677     MIB.addImm(Format);
3678 
3679   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3680      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3681      .addMemOperand(MMO);
3682 
3683   MI.eraseFromParent();
3684   return true;
3685 }
3686 
3687 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3688                                              MachineRegisterInfo &MRI,
3689                                              MachineIRBuilder &B,
3690                                              bool IsFormat,
3691                                              bool IsTyped) const {
3692   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3693   MachineMemOperand *MMO = *MI.memoperands_begin();
3694   const int MemSize = MMO->getSize();
3695   const LLT S32 = LLT::scalar(32);
3696 
3697   Register Dst = MI.getOperand(0).getReg();
3698   Register RSrc = MI.getOperand(2).getReg();
3699 
3700   // The typed intrinsics add an immediate after the registers.
3701   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3702 
3703   // The struct intrinsic variants add one additional operand over raw.
3704   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3705   Register VIndex;
3706   int OpOffset = 0;
3707   if (HasVIndex) {
3708     VIndex = MI.getOperand(3).getReg();
3709     OpOffset = 1;
3710   }
3711 
3712   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3713   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3714 
3715   unsigned Format = 0;
3716   if (IsTyped) {
3717     Format = MI.getOperand(5 + OpOffset).getImm();
3718     ++OpOffset;
3719   }
3720 
3721   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3722   unsigned ImmOffset;
3723   unsigned TotalOffset;
3724 
3725   LLT Ty = MRI.getType(Dst);
3726   LLT EltTy = Ty.getScalarType();
3727   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3728   const bool Unpacked = ST.hasUnpackedD16VMem();
3729 
3730   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3731   if (TotalOffset != 0)
3732     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3733 
3734   unsigned Opc;
3735 
3736   if (IsTyped) {
3737     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3738                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3739   } else if (IsFormat) {
3740     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3741                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3742   } else {
3743     switch (MemSize) {
3744     case 1:
3745       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3746       break;
3747     case 2:
3748       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3749       break;
3750     default:
3751       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3752       break;
3753     }
3754   }
3755 
3756   Register LoadDstReg;
3757 
3758   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3759   LLT UnpackedTy = Ty.changeElementSize(32);
3760 
3761   if (IsExtLoad)
3762     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3763   else if (Unpacked && IsD16 && Ty.isVector())
3764     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3765   else
3766     LoadDstReg = Dst;
3767 
3768   if (!VIndex)
3769     VIndex = B.buildConstant(S32, 0).getReg(0);
3770 
3771   auto MIB = B.buildInstr(Opc)
3772     .addDef(LoadDstReg)         // vdata
3773     .addUse(RSrc)               // rsrc
3774     .addUse(VIndex)             // vindex
3775     .addUse(VOffset)            // voffset
3776     .addUse(SOffset)            // soffset
3777     .addImm(ImmOffset);         // offset(imm)
3778 
3779   if (IsTyped)
3780     MIB.addImm(Format);
3781 
3782   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3783      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3784      .addMemOperand(MMO);
3785 
3786   if (LoadDstReg != Dst) {
3787     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3788 
    // The result register was widened for the extending load; truncate it back
    // to the original type.
3790     if (IsExtLoad)
3791       B.buildTrunc(Dst, LoadDstReg);
3792     else {
3793       // Repack to original 16-bit vector result
3794       // FIXME: G_TRUNC should work, but legalization currently fails
3795       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3796       SmallVector<Register, 4> Repack;
3797       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3798         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3799       B.buildMerge(Dst, Repack);
3800     }
3801   }
3802 
3803   MI.eraseFromParent();
3804   return true;
3805 }
3806 
3807 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3808                                                MachineIRBuilder &B,
3809                                                bool IsInc) const {
3810   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3811                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3812   B.buildInstr(Opc)
3813     .addDef(MI.getOperand(0).getReg())
3814     .addUse(MI.getOperand(2).getReg())
3815     .addUse(MI.getOperand(3).getReg())
3816     .cloneMemRefs(MI);
3817   MI.eraseFromParent();
3818   return true;
3819 }
3820 
3821 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3822   switch (IntrID) {
3823   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3824   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3825     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3826   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3827   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3828     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3829   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3830   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3831     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3832   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3833   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3834     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3835   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3836   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3837     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3838   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3839   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3840     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3841   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3842   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3843     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3844   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3845   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3846     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3847   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3848   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3849     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3850   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3851   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3852     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3853   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3854   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3855     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3856   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3857   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3858     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3859   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3860   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3861     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3862   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3863   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3864     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3865   default:
3866     llvm_unreachable("unhandled atomic opcode");
3867   }
3868 }
3869 
3870 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3871                                                MachineIRBuilder &B,
3872                                                Intrinsic::ID IID) const {
3873   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3874                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3875   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3876 
3877   Register Dst;
3878 
3879   int OpOffset = 0;
3880   if (HasReturn) {
3881     // A few FP atomics do not support return values.
3882     Dst = MI.getOperand(0).getReg();
3883   } else {
3884     OpOffset = -1;
3885   }
3886 
3887   Register VData = MI.getOperand(2 + OpOffset).getReg();
3888   Register CmpVal;
3889 
3890   if (IsCmpSwap) {
3891     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3892     ++OpOffset;
3893   }
3894 
3895   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3896   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3897 
3898   // The struct intrinsic variants add one additional operand over raw.
3899   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3900   Register VIndex;
3901   if (HasVIndex) {
3902     VIndex = MI.getOperand(4 + OpOffset).getReg();
3903     ++OpOffset;
3904   }
3905 
3906   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3907   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3908   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3909 
3910   MachineMemOperand *MMO = *MI.memoperands_begin();
3911 
3912   unsigned ImmOffset;
3913   unsigned TotalOffset;
3914   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3915   if (TotalOffset != 0)
3916     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3917 
3918   if (!VIndex)
3919     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3920 
3921   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3922 
3923   if (HasReturn)
3924     MIB.addDef(Dst);
3925 
3926   MIB.addUse(VData); // vdata
3927 
3928   if (IsCmpSwap)
3929     MIB.addReg(CmpVal);
3930 
3931   MIB.addUse(RSrc)               // rsrc
3932      .addUse(VIndex)             // vindex
3933      .addUse(VOffset)            // voffset
3934      .addUse(SOffset)            // soffset
3935      .addImm(ImmOffset)          // offset(imm)
3936      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3937      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3938      .addMemOperand(MMO);
3939 
3940   MI.eraseFromParent();
3941   return true;
3942 }
3943 
/// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
/// vectors, appending the packed registers to \p PackedAddrs.
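/// For example (illustrative): a 2D sample's 16-bit coordinates (s, t) become
/// a single <2 x s16> operand, and an unpaired trailing coordinate or gradient
/// is padded with undef.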
3946 static void packImageA16AddressToDwords(
3947     MachineIRBuilder &B, MachineInstr &MI,
3948     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3949     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3950   const LLT S16 = LLT::scalar(16);
3951   const LLT V2S16 = LLT::vector(2, 16);
3952 
3953   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3954     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3955     if (!SrcOp.isReg())
3956       continue; // _L to _LZ may have eliminated this.
3957 
3958     Register AddrReg = SrcOp.getReg();
3959 
3960     if (I < Intr->GradientStart) {
3961       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3962       PackedAddrs.push_back(AddrReg);
3963     } else {
3964       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3965       // derivatives dx/dh and dx/dv are packed with undef.
3966       if (((I + 1) >= EndIdx) ||
3967           ((Intr->NumGradients / 2) % 2 == 1 &&
3968            (I == static_cast<unsigned>(Intr->GradientStart +
3969                                        (Intr->NumGradients / 2) - 1) ||
3970             I == static_cast<unsigned>(Intr->GradientStart +
3971                                        Intr->NumGradients - 1))) ||
3972           // Check for _L to _LZ optimization
3973           !MI.getOperand(ArgOffset + I + 1).isReg()) {
3974         PackedAddrs.push_back(
3975             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3976                 .getReg(0));
3977       } else {
3978         PackedAddrs.push_back(
3979             B.buildBuildVector(
3980                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
3981                 .getReg(0));
3982         ++I;
3983       }
3984     }
3985   }
3986 }
3987 
3988 /// Convert from separate vaddr components to a single vector address register,
3989 /// and replace the remaining operands with $noreg.
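/// For example (illustrative): five s32 address operands are padded with undef
/// up to eight and emitted as a single <8 x s32> operand, with the remaining
/// vaddr operands set to $noreg.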
3990 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3991                                      int DimIdx, int NumVAddrs) {
3992   const LLT S32 = LLT::scalar(32);
3993 
3994   SmallVector<Register, 8> AddrRegs;
3995   for (int I = 0; I != NumVAddrs; ++I) {
3996     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3997     if (SrcOp.isReg()) {
3998       AddrRegs.push_back(SrcOp.getReg());
3999       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4000     }
4001   }
4002 
4003   int NumAddrRegs = AddrRegs.size();
4004   if (NumAddrRegs != 1) {
4005     // Round up to 8 elements for v5-v7
4006     // FIXME: Missing intermediate sized register classes and instructions.
4007     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4008       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4009       auto Undef = B.buildUndef(S32);
4010       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4011       NumAddrRegs = RoundedNumRegs;
4012     }
4013 
4014     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4015     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4016   }
4017 
4018   for (int I = 1; I != NumVAddrs; ++I) {
4019     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4020     if (SrcOp.isReg())
4021       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4022   }
4023 }
4024 
4025 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4026 ///
4027 /// Depending on the subtarget, load/store with 16-bit element data need to be
4028 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4029 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4030 /// registers.
4031 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
4038 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4039     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4040     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4041 
4042   const unsigned NumDefs = MI.getNumExplicitDefs();
4043   const unsigned ArgOffset = NumDefs + 1;
4044   bool IsTFE = NumDefs == 2;
4045   // We are only processing the operands of d16 image operations on subtargets
4046   // that use the unpacked register layout, or need to repack the TFE result.
4047 
4048   // TODO: Do we need to guard against already legalized intrinsics?
4049   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4050       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4051 
4052   MachineRegisterInfo *MRI = B.getMRI();
4053   const LLT S32 = LLT::scalar(32);
4054   const LLT S16 = LLT::scalar(16);
4055   const LLT V2S16 = LLT::vector(2, 16);
4056 
4057   unsigned DMask = 0;
4058 
  // Check for 16-bit addresses and pack them if so.
4060   LLT GradTy =
4061       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4062   LLT AddrTy =
4063       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4064   const bool IsG16 = GradTy == S16;
4065   const bool IsA16 = AddrTy == S16;
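  // A16 means the coordinates are 16-bit; G16 means only the explicit
  // derivative (gradient) operands are 16-bit.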
4066 
4067   int DMaskLanes = 0;
4068   if (!BaseOpcode->Atomic) {
4069     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4070     if (BaseOpcode->Gather4) {
4071       DMaskLanes = 4;
4072     } else if (DMask != 0) {
4073       DMaskLanes = countPopulation(DMask);
4074     } else if (!IsTFE && !BaseOpcode->Store) {
4075       // If dmask is 0, this is a no-op load. This can be eliminated.
4076       B.buildUndef(MI.getOperand(0));
4077       MI.eraseFromParent();
4078       return true;
4079     }
4080   }
4081 
4082   Observer.changingInstr(MI);
4083   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4084 
4085   unsigned NewOpcode = NumDefs == 0 ?
4086     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4087 
4088   // Track that we legalized this
4089   MI.setDesc(B.getTII().get(NewOpcode));
4090 
  // We expect to get an error flag since TFE is enabled but dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
4093   if (IsTFE && DMask == 0) {
4094     DMask = 0x1;
4095     DMaskLanes = 1;
4096     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4097   }
4098 
4099   if (BaseOpcode->Atomic) {
4100     Register VData0 = MI.getOperand(2).getReg();
4101     LLT Ty = MRI->getType(VData0);
4102 
4103     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4104     if (Ty.isVector())
4105       return false;
4106 
4107     if (BaseOpcode->AtomicX2) {
4108       Register VData1 = MI.getOperand(3).getReg();
4109       // The two values are packed in one register.
4110       LLT PackedTy = LLT::vector(2, Ty);
4111       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4112       MI.getOperand(2).setReg(Concat.getReg(0));
4113       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4114     }
4115   }
4116 
4117   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4118 
  // Optimize _L image instructions to _LZ when the LOD argument is zero or
  // negative.
4120   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4121           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4122     const ConstantFP *ConstantLod;
4123 
4124     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4125                  m_GFCst(ConstantLod))) {
4126       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4127         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4128         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4129             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4130                                                       Intr->Dim);
4131 
4132         // The starting indexes should remain in the same place.
4133         --CorrectedNumVAddrs;
4134 
4135         MI.getOperand(MI.getNumExplicitDefs())
4136             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4137         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4138         Intr = NewImageDimIntr;
4139       }
4140     }
4141   }
4142 
4143   // Optimize _mip away, when 'lod' is zero
4144   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4145     int64_t ConstantLod;
4146     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4147                  m_ICst(ConstantLod))) {
4148       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
4151         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4152         --CorrectedNumVAddrs;
4153       }
4154     }
4155   }
4156 
4157   // Rewrite the addressing register layout before doing anything else.
4158   if (IsA16 || IsG16) {
4159     if (IsA16) {
4160       // Target must support the feature and gradients need to be 16 bit too
4161       if (!ST.hasA16() || !IsG16)
4162         return false;
4163     } else if (!ST.hasG16())
4164       return false;
4165 
4166     if (Intr->NumVAddrs > 1) {
4167       SmallVector<Register, 4> PackedRegs;
4168       // Don't compress addresses for G16
4169       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4170       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4171                                   PackEndIdx);
4172 
4173       if (!IsA16) {
4174         // Add uncompressed address
4175         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4176           int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4177           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4178           PackedRegs.push_back(AddrReg);
4179         }
4180       }
4181 
4182       // See also below in the non-a16 branch
4183       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4184 
4185       if (!UseNSA && PackedRegs.size() > 1) {
4186         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4187         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4188         PackedRegs[0] = Concat.getReg(0);
4189         PackedRegs.resize(1);
4190       }
4191 
4192       const unsigned NumPacked = PackedRegs.size();
4193       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4194         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4195         if (!SrcOp.isReg()) {
4196           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4197           continue;
4198         }
4199 
4200         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4201 
4202         if (I - Intr->VAddrStart < NumPacked)
4203           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4204         else
4205           SrcOp.setReg(AMDGPU::NoRegister);
4206       }
4207     }
4208   } else {
4209     // If the register allocator cannot place the address registers contiguously
4210     // without introducing moves, then using the non-sequential address encoding
4211     // is always preferable, since it saves VALU instructions and is usually a
4212     // wash in terms of code size or even better.
4213     //
4214     // However, we currently have no way of hinting to the register allocator
4215     // that MIMG addresses should be placed contiguously when it is possible to
4216     // do so, so force non-NSA for the common 2-address case as a heuristic.
4217     //
4218     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4219     // allocation when possible.
4220     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4221 
4222     if (!UseNSA && Intr->NumVAddrs > 1)
4223       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4224                                Intr->NumVAddrs);
4225   }
4226 
4227   int Flags = 0;
4228   if (IsA16)
4229     Flags |= 1;
4230   if (IsG16)
4231     Flags |= 2;
4232   MI.addOperand(MachineOperand::CreateImm(Flags));
4233 
4234   if (BaseOpcode->Store) { // No TFE for stores?
4235     // TODO: Handle dmask trim
4236     Register VData = MI.getOperand(1).getReg();
4237     LLT Ty = MRI->getType(VData);
4238     if (!Ty.isVector() || Ty.getElementType() != S16)
4239       return true;
4240 
4241     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4242     if (RepackedReg != VData) {
4243       MI.getOperand(1).setReg(RepackedReg);
4244     }
4245 
4246     return true;
4247   }
4248 
4249   Register DstReg = MI.getOperand(0).getReg();
4250   LLT Ty = MRI->getType(DstReg);
4251   const LLT EltTy = Ty.getScalarType();
4252   const bool IsD16 = Ty.getScalarType() == S16;
4253   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4254 
4255   // Confirm that the return type is large enough for the dmask specified
4256   if (NumElts < DMaskLanes)
4257     return false;
4258 
4259   if (NumElts > 4 || DMaskLanes > 4)
4260     return false;
4261 
4262   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4263   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4264 
4265   // The raw dword aligned data component of the load. The only legal cases
4266   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4268   LLT RoundedTy;
4269 
  // S32 vector to cover all data, plus the TFE result element.
4271   LLT TFETy;
4272 
4273   // Register type to use for each loaded component. Will be S32 or V2S16.
4274   LLT RegTy;
4275 
4276   if (IsD16 && ST.hasUnpackedD16VMem()) {
4277     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4278     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4279     RegTy = S32;
4280   } else {
4281     unsigned EltSize = EltTy.getSizeInBits();
4282     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4283     unsigned RoundedSize = 32 * RoundedElts;
4284     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4285     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4286     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4287   }
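  // For example, a packed D16 load with three dmask lanes and TFE enabled uses
  // RoundedTy = <4 x s16>, TFETy = <3 x s32> (two data dwords plus the TFE
  // status dword) and RegTy = S32.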
4288 
4289   // The return type does not need adjustment.
4290   // TODO: Should we change s16 case to s32 or <2 x s16>?
4291   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4292     return true;
4293 
4294   Register Dst1Reg;
4295 
4296   // Insert after the instruction.
4297   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4298 
4299   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4300   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4301   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4302   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4303 
4304   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4305 
4306   MI.getOperand(0).setReg(NewResultReg);
4307 
4308   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
4310   // register, with one additional dword beyond the loaded data. Rewrite the
4311   // return type to use a single register result.
4312 
4313   if (IsTFE) {
4314     Dst1Reg = MI.getOperand(1).getReg();
4315     if (MRI->getType(Dst1Reg) != S32)
4316       return false;
4317 
4318     // TODO: Make sure the TFE operand bit is set.
4319     MI.RemoveOperand(1);
4320 
4321     // Handle the easy case that requires no repack instructions.
4322     if (Ty == S32) {
4323       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4324       return true;
4325     }
4326   }
4327 
4328   // Now figure out how to copy the new result register back into the old
4329   // result.
4330   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4331 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4333 
4334   if (ResultNumRegs == 1) {
4335     assert(!IsTFE);
4336     ResultRegs[0] = NewResultReg;
4337   } else {
4338     // We have to repack into a new vector of some kind.
4339     for (int I = 0; I != NumDataRegs; ++I)
4340       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4341     B.buildUnmerge(ResultRegs, NewResultReg);
4342 
4343     // Drop the final TFE element to get the data part. The TFE result is
4344     // directly written to the right place already.
4345     if (IsTFE)
4346       ResultRegs.resize(NumDataRegs);
4347   }
4348 
4349   // For an s16 scalar result, we form an s32 result with a truncate regardless
4350   // of packed vs. unpacked.
4351   if (IsD16 && !Ty.isVector()) {
4352     B.buildTrunc(DstReg, ResultRegs[0]);
4353     return true;
4354   }
4355 
4356   // Avoid a build/concat_vector of 1 entry.
4357   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4358     B.buildBitcast(DstReg, ResultRegs[0]);
4359     return true;
4360   }
4361 
4362   assert(Ty.isVector());
4363 
4364   if (IsD16) {
4365     // For packed D16 results with TFE enabled, all the data components are
4366     // S32. Cast back to the expected type.
4367     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4370     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4371       for (Register &Reg : ResultRegs)
4372         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4373     } else if (ST.hasUnpackedD16VMem()) {
4374       for (Register &Reg : ResultRegs)
4375         Reg = B.buildTrunc(S16, Reg).getReg(0);
4376     }
4377   }
4378 
4379   auto padWithUndef = [&](LLT Ty, int NumElts) {
4380     if (NumElts == 0)
4381       return;
4382     Register Undef = B.buildUndef(Ty).getReg(0);
4383     for (int I = 0; I != NumElts; ++I)
4384       ResultRegs.push_back(Undef);
4385   };
4386 
4387   // Pad out any elements eliminated due to the dmask.
4388   LLT ResTy = MRI->getType(ResultRegs[0]);
4389   if (!ResTy.isVector()) {
4390     padWithUndef(ResTy, NumElts - ResultRegs.size());
4391     B.buildBuildVector(DstReg, ResultRegs);
4392     return true;
4393   }
4394 
4395   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4396   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4397 
4398   // Deal with the one annoying legal case.
4399   const LLT V3S16 = LLT::vector(3, 16);
4400   if (Ty == V3S16) {
4401     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4402     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4403     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4404     return true;
4405   }
4406 
4407   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4408   B.buildConcatVectors(DstReg, ResultRegs);
4409   return true;
4410 }
4411 
4412 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4413   LegalizerHelper &Helper, MachineInstr &MI) const {
4414   MachineIRBuilder &B = Helper.MIRBuilder;
4415   GISelChangeObserver &Observer = Helper.Observer;
4416 
4417   Register Dst = MI.getOperand(0).getReg();
4418   LLT Ty = B.getMRI()->getType(Dst);
4419   unsigned Size = Ty.getSizeInBits();
4420   MachineFunction &MF = B.getMF();
4421 
4422   Observer.changingInstr(MI);
4423 
4424   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4425     Ty = getBitcastRegisterType(Ty);
4426     Helper.bitcastDst(MI, Ty, 0);
4427     Dst = MI.getOperand(0).getReg();
4428     B.setInsertPt(B.getMBB(), MI);
4429   }
4430 
4431   // FIXME: We don't really need this intermediate instruction. The intrinsic
4432   // should be fixed to have a memory operand. Since it's readnone, we're not
4433   // allowed to add one.
4434   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4435   MI.RemoveOperand(1); // Remove intrinsic ID
4436 
4437   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4438   // TODO: Should this use datalayout alignment?
4439   const unsigned MemSize = (Size + 7) / 8;
4440   const Align MemAlign(4);
4441   MachineMemOperand *MMO = MF.getMachineMemOperand(
4442       MachinePointerInfo(),
4443       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4444           MachineMemOperand::MOInvariant,
4445       MemSize, MemAlign);
4446   MI.addMemOperand(MF, MMO);
4447 
4448   // There are no 96-bit result scalar loads, but widening to 128-bit should
4449   // always be legal. We may need to restore this to a 96-bit result if it turns
4450   // out this needs to be converted to a vector load during RegBankSelect.
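  // For example, a <3 x s32> (96-bit) result is widened to <4 x s32> here.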
4451   if (!isPowerOf2_32(Size)) {
4452     if (Ty.isVector())
4453       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4454     else
4455       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4456   }
4457 
4458   Observer.changedInstr(MI);
4459   return true;
4460 }
4461 
4462 // TODO: Move to selection
4463 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4464                                                 MachineRegisterInfo &MRI,
4465                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, just insert an
  // s_endpgm instruction.
4467   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4468       !ST.isTrapHandlerEnabled()) {
4469     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4470   } else {
4471     // Pass queue pointer to trap handler as input, and insert trap instruction
4472     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4474 
4475     Register LiveIn =
4476       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4477     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4478       return false;
4479 
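    // Per the trap handler ABI referenced above, the queue pointer is passed
    // in SGPR0/SGPR1.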
4480     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4481     B.buildCopy(SGPR01, LiveIn);
4482     B.buildInstr(AMDGPU::S_TRAP)
4483         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4484         .addReg(SGPR01, RegState::Implicit);
4485   }
4486 
4487   MI.eraseFromParent();
4488   return true;
4489 }
4490 
4491 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4492     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning instead of emitting the debug trap.
4495   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4496       !ST.isTrapHandlerEnabled()) {
4497     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4498                                      "debugtrap handler not supported",
4499                                      MI.getDebugLoc(), DS_Warning);
4500     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4501     Ctx.diagnose(NoTrap);
4502   } else {
4503     // Insert debug-trap instruction
4504     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4505   }
4506 
4507   MI.eraseFromParent();
4508   return true;
4509 }
4510 
4511 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4512                                                MachineIRBuilder &B) const {
4513   MachineRegisterInfo &MRI = *B.getMRI();
4514   const LLT S16 = LLT::scalar(16);
4515   const LLT S32 = LLT::scalar(32);
4516 
4517   Register DstReg = MI.getOperand(0).getReg();
4518   Register NodePtr = MI.getOperand(2).getReg();
4519   Register RayExtent = MI.getOperand(3).getReg();
4520   Register RayOrigin = MI.getOperand(4).getReg();
4521   Register RayDir = MI.getOperand(5).getReg();
4522   Register RayInvDir = MI.getOperand(6).getReg();
4523   Register TDescr = MI.getOperand(7).getReg();
4524 
4525   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
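  // Select the NSA opcode variant based on the node pointer width (32- vs
  // 64-bit) and whether the ray coordinates are 16-bit (a16).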
4527   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4528                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4529                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4530                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4531 
4532   SmallVector<Register, 12> Ops;
4533   if (Is64) {
4534     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4535     Ops.push_back(Unmerge.getReg(0));
4536     Ops.push_back(Unmerge.getReg(1));
4537   } else {
4538     Ops.push_back(NodePtr);
4539   }
4540   Ops.push_back(RayExtent);
4541 
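  // Split a vector operand into 32-bit lanes and append the first three; the
  // fourth unmerge result is unused.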
  auto packLanes = [&Ops, &S32, &B](Register Src) {
4543     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4544     Ops.push_back(Unmerge.getReg(0));
4545     Ops.push_back(Unmerge.getReg(1));
4546     Ops.push_back(Unmerge.getReg(2));
4547   };
4548 
4549   packLanes(RayOrigin);
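  // With 16-bit rays, pack the direction and inverse direction halves into
  // three dwords: (dir.xy), (dir.z, invdir.x), (invdir.yz).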
4550   if (IsA16) {
4551     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4552     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4553     Register R1 = MRI.createGenericVirtualRegister(S32);
4554     Register R2 = MRI.createGenericVirtualRegister(S32);
4555     Register R3 = MRI.createGenericVirtualRegister(S32);
4556     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4557     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4558     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4559     Ops.push_back(R1);
4560     Ops.push_back(R2);
4561     Ops.push_back(R3);
4562   } else {
4563     packLanes(RayDir);
4564     packLanes(RayInvDir);
4565   }
4566 
4567   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4568     .addDef(DstReg)
4569     .addImm(Opcode);
4570 
  for (Register R : Ops)
    MIB.addUse(R);
4574 
4575   MIB.addUse(TDescr)
4576      .addImm(IsA16 ? 1 : 0)
4577      .cloneMemRefs(MI);
4578 
4579   MI.eraseFromParent();
4580   return true;
4581 }
4582 
4583 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4584                                             MachineInstr &MI) const {
4585   MachineIRBuilder &B = Helper.MIRBuilder;
4586   MachineRegisterInfo &MRI = *B.getMRI();
4587 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4589   auto IntrID = MI.getIntrinsicID();
4590   switch (IntrID) {
4591   case Intrinsic::amdgcn_if:
4592   case Intrinsic::amdgcn_else: {
4593     MachineInstr *Br = nullptr;
4594     MachineBasicBlock *UncondBrTarget = nullptr;
4595     bool Negated = false;
4596     if (MachineInstr *BrCond =
4597             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4598       const SIRegisterInfo *TRI
4599         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4600 
4601       Register Def = MI.getOperand(1).getReg();
4602       Register Use = MI.getOperand(3).getReg();
4603 
4604       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4605 
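      // verifyCFIntrinsic reported a negated condition, so the conditional and
      // unconditional targets trade places.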
4606       if (Negated)
4607         std::swap(CondBrTarget, UncondBrTarget);
4608 
4609       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4610       if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
4615       } else {
4616         B.buildInstr(AMDGPU::SI_ELSE)
4617             .addDef(Def)
4618             .addUse(Use)
4619             .addMBB(UncondBrTarget);
4620       }
4621 
4622       if (Br) {
4623         Br->getOperand(0).setMBB(CondBrTarget);
4624       } else {
4625         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4626         // since we're swapping branch targets it needs to be reinserted.
4627         // FIXME: IRTranslator should probably not do this
4628         B.buildBr(*CondBrTarget);
4629       }
4630 
4631       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4632       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4633       MI.eraseFromParent();
4634       BrCond->eraseFromParent();
4635       return true;
4636     }
4637 
4638     return false;
4639   }
4640   case Intrinsic::amdgcn_loop: {
4641     MachineInstr *Br = nullptr;
4642     MachineBasicBlock *UncondBrTarget = nullptr;
4643     bool Negated = false;
4644     if (MachineInstr *BrCond =
4645             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4646       const SIRegisterInfo *TRI
4647         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4648 
4649       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4650       Register Reg = MI.getOperand(2).getReg();
4651 
4652       if (Negated)
4653         std::swap(CondBrTarget, UncondBrTarget);
4654 
4655       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4656       B.buildInstr(AMDGPU::SI_LOOP)
4657         .addUse(Reg)
4658         .addMBB(UncondBrTarget);
4659 
4660       if (Br)
4661         Br->getOperand(0).setMBB(CondBrTarget);
4662       else
4663         B.buildBr(*CondBrTarget);
4664 
4665       MI.eraseFromParent();
4666       BrCond->eraseFromParent();
4667       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4668       return true;
4669     }
4670 
4671     return false;
4672   }
4673   case Intrinsic::amdgcn_kernarg_segment_ptr:
4674     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4675       // This only makes sense to call in a kernel, so just lower to null.
4676       B.buildConstant(MI.getOperand(0).getReg(), 0);
4677       MI.eraseFromParent();
4678       return true;
4679     }
4680 
4681     return legalizePreloadedArgIntrin(
4682       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4683   case Intrinsic::amdgcn_implicitarg_ptr:
4684     return legalizeImplicitArgPtr(MI, MRI, B);
4685   case Intrinsic::amdgcn_workitem_id_x:
4686     return legalizePreloadedArgIntrin(MI, MRI, B,
4687                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4688   case Intrinsic::amdgcn_workitem_id_y:
4689     return legalizePreloadedArgIntrin(MI, MRI, B,
4690                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4691   case Intrinsic::amdgcn_workitem_id_z:
4692     return legalizePreloadedArgIntrin(MI, MRI, B,
4693                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4694   case Intrinsic::amdgcn_workgroup_id_x:
4695     return legalizePreloadedArgIntrin(MI, MRI, B,
4696                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4697   case Intrinsic::amdgcn_workgroup_id_y:
4698     return legalizePreloadedArgIntrin(MI, MRI, B,
4699                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4700   case Intrinsic::amdgcn_workgroup_id_z:
4701     return legalizePreloadedArgIntrin(MI, MRI, B,
4702                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4703   case Intrinsic::amdgcn_dispatch_ptr:
4704     return legalizePreloadedArgIntrin(MI, MRI, B,
4705                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4706   case Intrinsic::amdgcn_queue_ptr:
4707     return legalizePreloadedArgIntrin(MI, MRI, B,
4708                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4709   case Intrinsic::amdgcn_implicit_buffer_ptr:
4710     return legalizePreloadedArgIntrin(
4711       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4712   case Intrinsic::amdgcn_dispatch_id:
4713     return legalizePreloadedArgIntrin(MI, MRI, B,
4714                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4715   case Intrinsic::amdgcn_fdiv_fast:
4716     return legalizeFDIVFastIntrin(MI, MRI, B);
4717   case Intrinsic::amdgcn_is_shared:
4718     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4719   case Intrinsic::amdgcn_is_private:
4720     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4721   case Intrinsic::amdgcn_wavefrontsize: {
4722     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4723     MI.eraseFromParent();
4724     return true;
4725   }
4726   case Intrinsic::amdgcn_s_buffer_load:
4727     return legalizeSBufferLoad(Helper, MI);
4728   case Intrinsic::amdgcn_raw_buffer_store:
4729   case Intrinsic::amdgcn_struct_buffer_store:
4730     return legalizeBufferStore(MI, MRI, B, false, false);
4731   case Intrinsic::amdgcn_raw_buffer_store_format:
4732   case Intrinsic::amdgcn_struct_buffer_store_format:
4733     return legalizeBufferStore(MI, MRI, B, false, true);
4734   case Intrinsic::amdgcn_raw_tbuffer_store:
4735   case Intrinsic::amdgcn_struct_tbuffer_store:
4736     return legalizeBufferStore(MI, MRI, B, true, true);
4737   case Intrinsic::amdgcn_raw_buffer_load:
4738   case Intrinsic::amdgcn_struct_buffer_load:
4739     return legalizeBufferLoad(MI, MRI, B, false, false);
4740   case Intrinsic::amdgcn_raw_buffer_load_format:
4741   case Intrinsic::amdgcn_struct_buffer_load_format:
4742     return legalizeBufferLoad(MI, MRI, B, true, false);
4743   case Intrinsic::amdgcn_raw_tbuffer_load:
4744   case Intrinsic::amdgcn_struct_tbuffer_load:
4745     return legalizeBufferLoad(MI, MRI, B, true, true);
4746   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4747   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4748   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4749   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4750   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4751   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4752   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4753   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4754   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4755   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4756   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4757   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4758   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4759   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4760   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4761   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4762   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4763   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4764   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4765   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4766   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4767   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4768   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4769   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4770   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4771   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4772   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4773   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4774     return legalizeBufferAtomic(MI, B, IntrID);
4775   case Intrinsic::amdgcn_atomic_inc:
4776     return legalizeAtomicIncDec(MI, B, true);
4777   case Intrinsic::amdgcn_atomic_dec:
4778     return legalizeAtomicIncDec(MI, B, false);
4779   case Intrinsic::trap:
4780     return legalizeTrapIntrinsic(MI, MRI, B);
4781   case Intrinsic::debugtrap:
4782     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4783   case Intrinsic::amdgcn_rsq_clamp:
4784     return legalizeRsqClampIntrinsic(MI, MRI, B);
4785   case Intrinsic::amdgcn_ds_fadd:
4786   case Intrinsic::amdgcn_ds_fmin:
4787   case Intrinsic::amdgcn_ds_fmax:
4788     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4789   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4790     return legalizeBVHIntrinsic(MI, B);
4791   default: {
4792     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4793             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4794       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4795     return true;
4796   }
4797   }
4798 
4799   return true;
4800 }
4801