1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/CodeGen/TargetOpcodes.h"
26 #include "llvm/CodeGen/ValueTypes.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31 
32 #define DEBUG_TYPE "amdgpu-legalinfo"
33 
34 using namespace llvm;
35 using namespace LegalizeActions;
36 using namespace LegalizeMutations;
37 using namespace LegalityPredicates;
38 using namespace MIPatternMatch;
39 
40 // Hack until load/store selection patterns support any tuple of legal types.
41 static cl::opt<bool> EnableNewLegality(
42   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
44            "rules compatible with selection patterns"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 static constexpr unsigned MaxRegisterSize = 1024;
49 
// Round the number of elements up to the next power of two.
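// e.g. <3 x s16> is rounded up to <4 x s16>.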
51 static LLT getPow2VectorType(LLT Ty) {
52   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
54   return Ty.changeNumElements(Pow2NElts);
55 }
56 
// Round the number of bits up to the next power of two.
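// e.g. s24 -> s32 and s65 -> s128.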
58 static LLT getPow2ScalarType(LLT Ty) {
59   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
61   return LLT::scalar(Pow2Bits);
62 }
63 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
67 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     if (!Ty.isVector())
71       return false;
72 
73     const LLT EltTy = Ty.getElementType();
74     const unsigned EltSize = EltTy.getSizeInBits();
75     return Ty.getNumElements() % 2 != 0 &&
76            EltSize > 1 && EltSize < 32 &&
77            Ty.getSizeInBits() % 32 != 0;
78   };
79 }
80 
81 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     return Ty.getSizeInBits() % 32 == 0;
85   };
86 }
87 
88 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getScalarType();
92     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
93   };
94 }
95 
96 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99     const LLT EltTy = Ty.getElementType();
100     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
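// Reduce the number of vector elements so that each resulting piece is no
// wider than 64 bits, e.g. <4 x s32> -> <2 x s32>.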
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
112   };
113 }
114 
115 // Increase the number of vector elements to reach the next multiple of 32-bit
116 // type.
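// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).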
117 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
118   return [=](const LegalityQuery &Query) {
119     const LLT Ty = Query.Types[TypeIdx];
120 
121     const LLT EltTy = Ty.getElementType();
122     const int Size = Ty.getSizeInBits();
123     const int EltSize = EltTy.getSizeInBits();
124     const int NextMul32 = (Size + 31) / 32;
125 
126     assert(EltSize < 32);
127 
128     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
129     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
130   };
131 }
132 
133 static LLT getBitcastRegisterType(const LLT Ty) {
134   const unsigned Size = Ty.getSizeInBits();
135 
137   if (Size <= 32) {
138     // <2 x s8> -> s16
139     // <4 x s8> -> s32
140     return LLT::scalar(Size);
141   }
142 
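  // <6 x s16> -> <3 x s32>
  // <8 x s16> -> <4 x s32>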
143   return LLT::scalarOrVector(Size / 32, 32);
144 }
145 
146 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147   return [=](const LegalityQuery &Query) {
148     const LLT Ty = Query.Types[TypeIdx];
149     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
150   };
151 }
152 
153 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     unsigned Size = Ty.getSizeInBits();
157     assert(Size % 32 == 0);
158     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
159   };
160 }
161 
162 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
163   return [=](const LegalityQuery &Query) {
164     const LLT QueryTy = Query.Types[TypeIdx];
165     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
166   };
167 }
168 
169 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
170   return [=](const LegalityQuery &Query) {
171     const LLT QueryTy = Query.Types[TypeIdx];
172     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
173   };
174 }
175 
176 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
177   return [=](const LegalityQuery &Query) {
178     const LLT QueryTy = Query.Types[TypeIdx];
179     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
180   };
181 }
182 
183 static bool isRegisterSize(unsigned Size) {
184   return Size % 32 == 0 && Size <= MaxRegisterSize;
185 }
186 
187 static bool isRegisterVectorElementType(LLT EltTy) {
188   const int EltSize = EltTy.getSizeInBits();
189   return EltSize == 16 || EltSize % 32 == 0;
190 }
191 
192 static bool isRegisterVectorType(LLT Ty) {
193   const int EltSize = Ty.getElementType().getSizeInBits();
194   return EltSize == 32 || EltSize == 64 ||
195          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
196          EltSize == 128 || EltSize == 256;
197 }
198 
199 static bool isRegisterType(LLT Ty) {
200   if (!isRegisterSize(Ty.getSizeInBits()))
201     return false;
202 
203   if (Ty.isVector())
204     return isRegisterVectorType(Ty);
205 
206   return true;
207 }
208 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
211 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
212   return [=](const LegalityQuery &Query) {
213     return isRegisterType(Query.Types[TypeIdx]);
214   };
215 }
216 
217 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
218   return [=](const LegalityQuery &Query) {
219     const LLT QueryTy = Query.Types[TypeIdx];
220     if (!QueryTy.isVector())
221       return false;
222     const LLT EltTy = QueryTy.getElementType();
223     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
224   };
225 }
226 
227 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229     const LLT Ty = Query.Types[TypeIdx];
230     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232   };
233 }
234 
// TODO: Should loads to s16 be legal? Most loads extend to 32 bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239                                     bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return ST.enableFlatScratch() ? 128 : 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
255     return IsLoad ? 512 : 128;
256   default:
257     // Flat addresses may contextually need to be split to 32-bit parts if they
258     // may alias scratch depending on the subtarget.
259     return 128;
260   }
261 }
262 
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query,
265                                  unsigned Opcode) {
266   const LLT Ty = Query.Types[0];
267 
268   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
269   const bool IsLoad = Opcode != AMDGPU::G_STORE;
270 
271   unsigned RegSize = Ty.getSizeInBits();
272   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
273   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
274   unsigned AS = Query.Types[1].getAddressSpace();
275 
276   // All of these need to be custom lowered to cast the pointer operand.
277   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
278     return false;
279 
280   // TODO: We should be able to widen loads if the alignment is high enough, but
281   // we also need to modify the memory access size.
282 #if 0
283   // Accept widening loads based on alignment.
284   if (IsLoad && MemSize < Size)
285     MemSize = std::max(MemSize, Align);
286 #endif
287 
288   // Only 1-byte and 2-byte to 32-bit extloads are valid.
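  // e.g. an s8 or s16 access extending into an s32 register is fine, but an
  // s8 access extending into s64 is rejected here.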
289   if (MemSize != RegSize && RegSize != 32)
290     return false;
291 
292   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
293     return false;
294 
295   switch (MemSize) {
296   case 8:
297   case 16:
298   case 32:
299   case 64:
300   case 128:
301     break;
302   case 96:
303     if (!ST.hasDwordx3LoadStores())
304       return false;
305     break;
306   case 256:
307   case 512:
308     // These may contextually need to be broken down.
309     break;
310   default:
311     return false;
312   }
313 
314   assert(RegSize >= MemSize);
315 
316   if (AlignBits < MemSize) {
317     const SITargetLowering *TLI = ST.getTargetLowering();
318     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
319                                                  Align(AlignBits / 8)))
320       return false;
321   }
322 
323   return true;
324 }
325 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
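// e.g. an s96 load is bitcast to <3 x s32>, and an <8 x s16> load to
// <4 x s32>.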
330 static bool loadStoreBitcastWorkaround(const LLT Ty) {
331   if (EnableNewLegality)
332     return false;
333 
334   const unsigned Size = Ty.getSizeInBits();
335   if (Size <= 64)
336     return false;
337   if (!Ty.isVector())
338     return true;
339 
340   LLT EltTy = Ty.getElementType();
341   if (EltTy.isPointer())
342     return true;
343 
344   unsigned EltSize = EltTy.getSizeInBits();
345   return EltSize != 32 && EltSize != 64;
346 }
347 
348 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
349                              unsigned Opcode) {
350   const LLT Ty = Query.Types[0];
351   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
352          !loadStoreBitcastWorkaround(Ty);
353 }
354 
355 /// Return true if a load or store of the type should be lowered with a bitcast
356 /// to a different type.
357 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
358                                        const unsigned MemSizeInBits) {
359   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
362 
363   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
364     return true;
365   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
366          !isRegisterVectorElementType(Ty.getElementType());
367 }
368 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case it is the memory access
/// itself that changes, not the size of the result register.
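/// e.g. a 24-bit load known to be 32-bit aligned may be widened to a full
/// 32-bit load.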
372 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
373                             unsigned AlignInBits, unsigned AddrSpace,
374                             unsigned Opcode) {
375   // We don't want to widen cases that are naturally legal.
376   if (isPowerOf2_32(SizeInBits))
377     return false;
378 
379   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
380   // end up widening these for a scalar load during RegBankSelect, since there
381   // aren't 96-bit scalar loads.
382   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
383     return false;
384 
385   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
386     return false;
387 
388   // A load is known dereferenceable up to the alignment, so it's legal to widen
389   // to it.
390   //
391   // TODO: Could check dereferenceable for less aligned cases.
392   unsigned RoundedSize = NextPowerOf2(SizeInBits);
393   if (AlignInBits < RoundedSize)
394     return false;
395 
396   // Do not widen if it would introduce a slow unaligned load.
397   const SITargetLowering *TLI = ST.getTargetLowering();
398   bool Fast = false;
399   return TLI->allowsMisalignedMemoryAccessesImpl(
400              RoundedSize, AddrSpace, Align(AlignInBits / 8),
401              MachineMemOperand::MOLoad, &Fast) &&
402          Fast;
403 }
404 
405 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
406                             unsigned Opcode) {
407   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
408     return false;
409 
410   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
411                          Query.MMODescrs[0].AlignInBits,
412                          Query.Types[1].getAddressSpace(), Opcode);
413 }
414 
415 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
416                                          const GCNTargetMachine &TM)
  : ST(ST_) {
418   using namespace TargetOpcode;
419 
420   auto GetAddrSpacePtr = [&TM](unsigned AS) {
421     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
422   };
423 
424   const LLT S1 = LLT::scalar(1);
425   const LLT S8 = LLT::scalar(8);
426   const LLT S16 = LLT::scalar(16);
427   const LLT S32 = LLT::scalar(32);
428   const LLT S64 = LLT::scalar(64);
429   const LLT S128 = LLT::scalar(128);
430   const LLT S256 = LLT::scalar(256);
431   const LLT S512 = LLT::scalar(512);
432   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
433 
434   const LLT V2S8 = LLT::vector(2, 8);
435   const LLT V2S16 = LLT::vector(2, 16);
436   const LLT V4S16 = LLT::vector(4, 16);
437 
438   const LLT V2S32 = LLT::vector(2, 32);
439   const LLT V3S32 = LLT::vector(3, 32);
440   const LLT V4S32 = LLT::vector(4, 32);
441   const LLT V5S32 = LLT::vector(5, 32);
442   const LLT V6S32 = LLT::vector(6, 32);
443   const LLT V7S32 = LLT::vector(7, 32);
444   const LLT V8S32 = LLT::vector(8, 32);
445   const LLT V9S32 = LLT::vector(9, 32);
446   const LLT V10S32 = LLT::vector(10, 32);
447   const LLT V11S32 = LLT::vector(11, 32);
448   const LLT V12S32 = LLT::vector(12, 32);
449   const LLT V13S32 = LLT::vector(13, 32);
450   const LLT V14S32 = LLT::vector(14, 32);
451   const LLT V15S32 = LLT::vector(15, 32);
452   const LLT V16S32 = LLT::vector(16, 32);
453   const LLT V32S32 = LLT::vector(32, 32);
454 
455   const LLT V2S64 = LLT::vector(2, 64);
456   const LLT V3S64 = LLT::vector(3, 64);
457   const LLT V4S64 = LLT::vector(4, 64);
458   const LLT V5S64 = LLT::vector(5, 64);
459   const LLT V6S64 = LLT::vector(6, 64);
460   const LLT V7S64 = LLT::vector(7, 64);
461   const LLT V8S64 = LLT::vector(8, 64);
462   const LLT V16S64 = LLT::vector(16, 64);
463 
464   std::initializer_list<LLT> AllS32Vectors =
465     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
466      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
467   std::initializer_list<LLT> AllS64Vectors =
468     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
469 
470   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
471   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
472   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
473   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
474   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
475   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
476   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
477 
478   const LLT CodePtr = FlatPtr;
479 
480   const std::initializer_list<LLT> AddrSpaces64 = {
481     GlobalPtr, ConstantPtr, FlatPtr
482   };
483 
484   const std::initializer_list<LLT> AddrSpaces32 = {
485     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
486   };
487 
488   const std::initializer_list<LLT> FPTypesBase = {
489     S32, S64
490   };
491 
492   const std::initializer_list<LLT> FPTypes16 = {
493     S32, S64, S16
494   };
495 
496   const std::initializer_list<LLT> FPTypesPK16 = {
497     S32, S64, S16, V2S16
498   };
499 
500   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
501 
502   setAction({G_BRCOND, S1}, Legal); // VCC branches
503   setAction({G_BRCOND, S32}, Legal); // SCC branches
504 
505   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
506   // elements for v3s16
507   getActionDefinitionsBuilder(G_PHI)
508     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
509     .legalFor(AllS32Vectors)
510     .legalFor(AllS64Vectors)
511     .legalFor(AddrSpaces64)
512     .legalFor(AddrSpaces32)
513     .legalIf(isPointer(0))
514     .clampScalar(0, S16, S256)
515     .widenScalarToNextPow2(0, 32)
516     .clampMaxNumElements(0, S32, 16)
517     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
518     .scalarize(0);
519 
520   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
521     // Full set of gfx9 features.
522     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
523       .legalFor({S32, S16, V2S16})
524       .clampScalar(0, S16, S32)
525       .clampMaxNumElements(0, S16, 2)
526       .scalarize(0)
527       .widenScalarToNextPow2(0, 32);
528 
529     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
530       .legalFor({S32, S16, V2S16}) // Clamp modifier
531       .minScalarOrElt(0, S16)
532       .clampMaxNumElements(0, S16, 2)
533       .scalarize(0)
534       .widenScalarToNextPow2(0, 32)
535       .lower();
536   } else if (ST.has16BitInsts()) {
537     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
538       .legalFor({S32, S16})
539       .clampScalar(0, S16, S32)
540       .scalarize(0)
541       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
542 
543     // Technically the saturating operations require clamp bit support, but this
544     // was introduced at the same time as 16-bit operations.
545     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
546       .legalFor({S32, S16}) // Clamp modifier
547       .minScalar(0, S16)
548       .scalarize(0)
549       .widenScalarToNextPow2(0, 16)
550       .lower();
551 
552     // We're just lowering this, but it helps get a better result to try to
553     // coerce to the desired type first.
554     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
555       .minScalar(0, S16)
556       .scalarize(0)
557       .lower();
558   } else {
559     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
560       .legalFor({S32})
561       .clampScalar(0, S32, S32)
562       .scalarize(0);
563 
564     if (ST.hasIntClamp()) {
565       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
566         .legalFor({S32}) // Clamp modifier.
567         .scalarize(0)
568         .minScalarOrElt(0, S32)
569         .lower();
570     } else {
571       // Clamp bit support was added in VI, along with 16-bit operations.
572       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
573         .minScalar(0, S32)
574         .scalarize(0)
575         .lower();
576     }
577 
578     // FIXME: DAG expansion gets better results. The widening uses the smaller
579     // range values and goes for the min/max lowering directly.
580     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
581       .minScalar(0, S32)
582       .scalarize(0)
583       .lower();
584   }
585 
586   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
587     .customFor({S32, S64})
588     .clampScalar(0, S32, S64)
589     .widenScalarToNextPow2(0, 32)
590     .scalarize(0);
591 
592   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
593                    .legalFor({S32})
594                    .maxScalarOrElt(0, S32);
595 
596   if (ST.hasVOP3PInsts()) {
597     Mulh
598       .clampMaxNumElements(0, S8, 2)
599       .lowerFor({V2S8});
600   }
601 
602   Mulh
603     .scalarize(0)
604     .lower();
605 
606   // Report legal for any types we can handle anywhere. For the cases only legal
607   // on the SALU, RegBankSelect will be able to re-legalize.
608   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
609     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
610     .clampScalar(0, S32, S64)
611     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
612     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
613     .widenScalarToNextPow2(0)
614     .scalarize(0);
615 
616   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
617                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
618     .legalFor({{S32, S1}, {S32, S32}})
619     .minScalar(0, S32)
620     // TODO: .scalarize(0)
621     .lower();
622 
623   getActionDefinitionsBuilder(G_BITCAST)
624     // Don't worry about the size constraint.
625     .legalIf(all(isRegisterType(0), isRegisterType(1)))
626     .lower();
627 
629   getActionDefinitionsBuilder(G_CONSTANT)
630     .legalFor({S1, S32, S64, S16, GlobalPtr,
631                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
632     .legalIf(isPointer(0))
633     .clampScalar(0, S32, S64)
634     .widenScalarToNextPow2(0);
635 
636   getActionDefinitionsBuilder(G_FCONSTANT)
637     .legalFor({S32, S64, S16})
638     .clampScalar(0, S16, S64);
639 
640   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
641       .legalIf(isRegisterType(0))
642       // s1 and s16 are special cases because they have legal operations on
643       // them, but don't really occupy registers in the normal way.
644       .legalFor({S1, S16})
645       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
646       .clampScalarOrElt(0, S32, MaxScalar)
647       .widenScalarToNextPow2(0, 32)
648       .clampMaxNumElements(0, S32, 16);
649 
650   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
651 
652   // If the amount is divergent, we have to do a wave reduction to get the
653   // maximum value, so this is expanded during RegBankSelect.
654   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
655     .legalFor({{PrivatePtr, S32}});
656 
657   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
658     .customIf(typeIsNot(0, PrivatePtr));
659 
660   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
661 
662   auto &FPOpActions = getActionDefinitionsBuilder(
663     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
664     .legalFor({S32, S64});
665   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
666     .customFor({S32, S64});
667   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
668     .customFor({S32, S64});
669 
670   if (ST.has16BitInsts()) {
671     if (ST.hasVOP3PInsts())
672       FPOpActions.legalFor({S16, V2S16});
673     else
674       FPOpActions.legalFor({S16});
675 
676     TrigActions.customFor({S16});
677     FDIVActions.customFor({S16});
678   }
679 
680   auto &MinNumMaxNum = getActionDefinitionsBuilder({
681       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
682 
683   if (ST.hasVOP3PInsts()) {
684     MinNumMaxNum.customFor(FPTypesPK16)
685       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
686       .clampMaxNumElements(0, S16, 2)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else if (ST.has16BitInsts()) {
690     MinNumMaxNum.customFor(FPTypes16)
691       .clampScalar(0, S16, S64)
692       .scalarize(0);
693   } else {
694     MinNumMaxNum.customFor(FPTypesBase)
695       .clampScalar(0, S32, S64)
696       .scalarize(0);
697   }
698 
699   if (ST.hasVOP3PInsts())
700     FPOpActions.clampMaxNumElements(0, S16, 2);
701 
702   FPOpActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   TrigActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   FDIVActions
711     .scalarize(0)
712     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
713 
714   getActionDefinitionsBuilder({G_FNEG, G_FABS})
715     .legalFor(FPTypesPK16)
716     .clampMaxNumElements(0, S16, 2)
717     .scalarize(0)
718     .clampScalar(0, S16, S64);
719 
720   if (ST.has16BitInsts()) {
721     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
722       .legalFor({S32, S64, S16})
723       .scalarize(0)
724       .clampScalar(0, S16, S64);
725   } else {
726     getActionDefinitionsBuilder(G_FSQRT)
727       .legalFor({S32, S64})
728       .scalarize(0)
729       .clampScalar(0, S32, S64);
730 
731     if (ST.hasFractBug()) {
732       getActionDefinitionsBuilder(G_FFLOOR)
733         .customFor({S64})
734         .legalFor({S32, S64})
735         .scalarize(0)
736         .clampScalar(0, S32, S64);
737     } else {
738       getActionDefinitionsBuilder(G_FFLOOR)
739         .legalFor({S32, S64})
740         .scalarize(0)
741         .clampScalar(0, S32, S64);
742     }
743   }
744 
745   getActionDefinitionsBuilder(G_FPTRUNC)
746     .legalFor({{S32, S64}, {S16, S32}})
747     .scalarize(0)
748     .lower();
749 
750   getActionDefinitionsBuilder(G_FPEXT)
751     .legalFor({{S64, S32}, {S32, S16}})
752     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
753     .scalarize(0);
754 
755   getActionDefinitionsBuilder(G_FSUB)
756       // Use actual fsub instruction
757       .legalFor({S32})
758       // Must use fadd + fneg
759       .lowerFor({S64, S16, V2S16})
760       .scalarize(0)
761       .clampScalar(0, S32, S64);
762 
763   // Whether this is legal depends on the floating point mode for the function.
764   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
765   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
766     FMad.customFor({S32, S16});
767   else if (ST.hasMadMacF32Insts())
768     FMad.customFor({S32});
769   else if (ST.hasMadF16())
770     FMad.customFor({S16});
771   FMad.scalarize(0)
772       .lower();
773 
774   auto &FRem = getActionDefinitionsBuilder(G_FREM);
775   if (ST.has16BitInsts()) {
776     FRem.customFor({S16, S32, S64});
777   } else {
778     FRem.minScalar(0, S32)
779         .customFor({S32, S64});
780   }
781   FRem.scalarize(0);
782 
783   // TODO: Do we need to clamp maximum bitwidth?
784   getActionDefinitionsBuilder(G_TRUNC)
785     .legalIf(isScalar(0))
786     .legalFor({{V2S16, V2S32}})
787     .clampMaxNumElements(0, S16, 2)
788     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
789     // situations (like an invalid implicit use), we don't want to infinite loop
790     // in the legalizer.
791     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
792     .alwaysLegal();
793 
794   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
795     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
796                {S32, S1}, {S64, S1}, {S16, S1}})
797     .scalarize(0)
798     .clampScalar(0, S32, S64)
799     .widenScalarToNextPow2(1, 32);
800 
801   // TODO: Split s1->s64 during regbankselect for VALU.
802   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
803     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
804     .lowerFor({{S32, S64}})
805     .lowerIf(typeIs(1, S1))
806     .customFor({{S64, S64}});
807   if (ST.has16BitInsts())
808     IToFP.legalFor({{S16, S16}});
809   IToFP.clampScalar(1, S32, S64)
810        .minScalar(0, S32)
811        .scalarize(0)
812        .widenScalarToNextPow2(1);
813 
814   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
815     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
816     .customFor({{S64, S64}})
817     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
818   if (ST.has16BitInsts())
819     FPToI.legalFor({{S16, S16}});
820   else
821     FPToI.minScalar(1, S32);
822 
823   FPToI.minScalar(0, S32)
824        .scalarize(0)
825        .lower();
826 
827   // Lower roundeven into G_FRINT
828   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
829     .scalarize(0)
830     .lower();
831 
832   if (ST.has16BitInsts()) {
833     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
834       .legalFor({S16, S32, S64})
835       .clampScalar(0, S16, S64)
836       .scalarize(0);
837   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
838     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
839       .legalFor({S32, S64})
840       .clampScalar(0, S32, S64)
841       .scalarize(0);
842   } else {
843     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
844       .legalFor({S32})
845       .customFor({S64})
846       .clampScalar(0, S32, S64)
847       .scalarize(0);
848   }
849 
850   getActionDefinitionsBuilder(G_PTR_ADD)
851     .legalIf(all(isPointer(0), sameSize(0, 1)))
852     .scalarize(0)
853     .scalarSameSizeAs(1, 0);
854 
855   getActionDefinitionsBuilder(G_PTRMASK)
856     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
857     .scalarSameSizeAs(1, 0)
858     .scalarize(0);
859 
860   auto &CmpBuilder =
861     getActionDefinitionsBuilder(G_ICMP)
862     // The compare output type differs based on the register bank of the output,
863     // so make both s1 and s32 legal.
864     //
865     // Scalar compares producing output in scc will be promoted to s32, as that
866     // is the allocatable register type that will be needed for the copy from
867     // scc. This will be promoted during RegBankSelect, and we assume something
868     // before that won't try to use s32 result types.
869     //
870     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
871     // bank.
872     .legalForCartesianProduct(
873       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
874     .legalForCartesianProduct(
875       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
876   if (ST.has16BitInsts()) {
877     CmpBuilder.legalFor({{S1, S16}});
878   }
879 
880   CmpBuilder
881     .widenScalarToNextPow2(1)
882     .clampScalar(1, S32, S64)
883     .scalarize(0)
884     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
885 
886   getActionDefinitionsBuilder(G_FCMP)
887     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
888     .widenScalarToNextPow2(1)
889     .clampScalar(1, S32, S64)
890     .scalarize(0);
891 
892   // FIXME: fpow has a selection pattern that should move to custom lowering.
893   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
894   if (ST.has16BitInsts())
895     Exp2Ops.legalFor({S32, S16});
896   else
897     Exp2Ops.legalFor({S32});
898   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
899   Exp2Ops.scalarize(0);
900 
901   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
902   if (ST.has16BitInsts())
903     ExpOps.customFor({{S32}, {S16}});
904   else
905     ExpOps.customFor({S32});
906   ExpOps.clampScalar(0, MinScalarFPTy, S32)
907         .scalarize(0);
908 
909   getActionDefinitionsBuilder(G_FPOWI)
910     .clampScalar(0, MinScalarFPTy, S32)
911     .lower();
912 
913   // The 64-bit versions produce 32-bit results, but only on the SALU.
914   getActionDefinitionsBuilder(G_CTPOP)
915     .legalFor({{S32, S32}, {S32, S64}})
916     .clampScalar(0, S32, S32)
917     .clampScalar(1, S32, S64)
918     .scalarize(0)
919     .widenScalarToNextPow2(0, 32)
920     .widenScalarToNextPow2(1, 32);
921 
922   // The hardware instructions return a different result on 0 than the generic
923   // instructions expect. The hardware produces -1, but these produce the
924   // bitwidth.
925   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
926     .scalarize(0)
927     .clampScalar(0, S32, S32)
928     .clampScalar(1, S32, S64)
929     .widenScalarToNextPow2(0, 32)
930     .widenScalarToNextPow2(1, 32)
931     .lower();
932 
933   // The 64-bit versions produce 32-bit results, but only on the SALU.
934   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
935     .legalFor({{S32, S32}, {S32, S64}})
936     .clampScalar(0, S32, S32)
937     .clampScalar(1, S32, S64)
938     .scalarize(0)
939     .widenScalarToNextPow2(0, 32)
940     .widenScalarToNextPow2(1, 32);
941 
942   getActionDefinitionsBuilder(G_BITREVERSE)
943     .legalFor({S32})
944     .clampScalar(0, S32, S32)
945     .scalarize(0);
946 
947   if (ST.has16BitInsts()) {
948     getActionDefinitionsBuilder(G_BSWAP)
949       .legalFor({S16, S32, V2S16})
950       .clampMaxNumElements(0, S16, 2)
951       // FIXME: Fixing non-power-of-2 before clamp is workaround for
952       // narrowScalar limitation.
953       .widenScalarToNextPow2(0)
954       .clampScalar(0, S16, S32)
955       .scalarize(0);
956 
957     if (ST.hasVOP3PInsts()) {
958       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
959         .legalFor({S32, S16, V2S16})
960         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
961         .clampMaxNumElements(0, S16, 2)
962         .minScalar(0, S16)
963         .widenScalarToNextPow2(0)
964         .scalarize(0)
965         .lower();
966     } else {
967       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
968         .legalFor({S32, S16})
969         .widenScalarToNextPow2(0)
970         .minScalar(0, S16)
971         .scalarize(0)
972         .lower();
973     }
974   } else {
975     // TODO: Should have same legality without v_perm_b32
976     getActionDefinitionsBuilder(G_BSWAP)
977       .legalFor({S32})
978       .lowerIf(scalarNarrowerThan(0, 32))
979       // FIXME: Fixing non-power-of-2 before clamp is workaround for
980       // narrowScalar limitation.
981       .widenScalarToNextPow2(0)
982       .maxScalar(0, S32)
983       .scalarize(0)
984       .lower();
985 
986     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
987       .legalFor({S32})
988       .minScalar(0, S32)
989       .widenScalarToNextPow2(0)
990       .scalarize(0)
991       .lower();
992   }
993 
994   getActionDefinitionsBuilder(G_INTTOPTR)
995     // List the common cases
996     .legalForCartesianProduct(AddrSpaces64, {S64})
997     .legalForCartesianProduct(AddrSpaces32, {S32})
998     .scalarize(0)
999     // Accept any address space as long as the size matches
1000     .legalIf(sameSize(0, 1))
1001     .widenScalarIf(smallerThan(1, 0),
1002       [](const LegalityQuery &Query) {
1003         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004       })
1005     .narrowScalarIf(largerThan(1, 0),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1008       });
1009 
1010   getActionDefinitionsBuilder(G_PTRTOINT)
1011     // List the common cases
1012     .legalForCartesianProduct(AddrSpaces64, {S64})
1013     .legalForCartesianProduct(AddrSpaces32, {S32})
1014     .scalarize(0)
1015     // Accept any address space as long as the size matches
1016     .legalIf(sameSize(0, 1))
1017     .widenScalarIf(smallerThan(0, 1),
1018       [](const LegalityQuery &Query) {
1019         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1020       })
1021     .narrowScalarIf(
1022       largerThan(0, 1),
1023       [](const LegalityQuery &Query) {
1024         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1025       });
1026 
1027   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1028     .scalarize(0)
1029     .custom();
1030 
1031   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1032                                     bool IsLoad) -> bool {
1033     const LLT DstTy = Query.Types[0];
1034 
1035     // Split vector extloads.
1036     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1037     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1038 
1039     if (MemSize < DstTy.getSizeInBits())
1040       MemSize = std::max(MemSize, AlignBits);
1041 
1042     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1043       return true;
1044 
1045     const LLT PtrTy = Query.Types[1];
1046     unsigned AS = PtrTy.getAddressSpace();
1047     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1048       return true;
1049 
1050     // Catch weird sized loads that don't evenly divide into the access sizes
1051     // TODO: May be able to widen depending on alignment etc.
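    // e.g. a 96-bit access requires dwordx3 support, and a 160-bit (5 dword)
    // access must be split.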
1052     unsigned NumRegs = (MemSize + 31) / 32;
1053     if (NumRegs == 3) {
1054       if (!ST.hasDwordx3LoadStores())
1055         return true;
1056     } else {
1057       // If the alignment allows, these should have been widened.
1058       if (!isPowerOf2_32(NumRegs))
1059         return true;
1060     }
1061 
1062     if (AlignBits < MemSize) {
1063       const SITargetLowering *TLI = ST.getTargetLowering();
1064       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1065                                                       Align(AlignBits / 8));
1066     }
1067 
1068     return false;
1069   };
1070 
1071   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1072   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1073   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1074 
1075   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1076   // LDS
1077   // TODO: Unsupported flat for SI.
1078 
1079   for (unsigned Op : {G_LOAD, G_STORE}) {
1080     const bool IsStore = Op == G_STORE;
1081 
1082     auto &Actions = getActionDefinitionsBuilder(Op);
1083     // Explicitly list some common cases.
1084     // TODO: Does this help compile time at all?
1085     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1086                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1087                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1088                                       {S64, GlobalPtr, 64, GlobalAlign32},
1089                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1090                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1091                                       {S32, GlobalPtr, 8, GlobalAlign8},
1092                                       {S32, GlobalPtr, 16, GlobalAlign16},
1093 
1094                                       {S32, LocalPtr, 32, 32},
1095                                       {S64, LocalPtr, 64, 32},
1096                                       {V2S32, LocalPtr, 64, 32},
1097                                       {S32, LocalPtr, 8, 8},
1098                                       {S32, LocalPtr, 16, 16},
1099                                       {V2S16, LocalPtr, 32, 32},
1100 
1101                                       {S32, PrivatePtr, 32, 32},
1102                                       {S32, PrivatePtr, 8, 8},
1103                                       {S32, PrivatePtr, 16, 16},
1104                                       {V2S16, PrivatePtr, 32, 32},
1105 
1106                                       {S32, ConstantPtr, 32, GlobalAlign32},
1107                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1108                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1109                                       {S64, ConstantPtr, 64, GlobalAlign32},
1110                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1111     Actions.legalIf(
1112       [=](const LegalityQuery &Query) -> bool {
1113         return isLoadStoreLegal(ST, Query, Op);
1114       });
1115 
1116     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1117     // 64-bits.
1118     //
1119     // TODO: Should generalize bitcast action into coerce, which will also cover
1120     // inserting addrspacecasts.
1121     Actions.customIf(typeIs(1, Constant32Ptr));
1122 
1123     // Turn any illegal element vectors into something easier to deal
1124     // with. These will ultimately produce 32-bit scalar shifts to extract the
1125     // parts anyway.
1126     //
1127     // For odd 16-bit element vectors, prefer to split those into pieces with
1128     // 16-bit vector parts.
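    // e.g. a <4 x s8> load is bitcast to an s32 load here.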
1129     Actions.bitcastIf(
1130       [=](const LegalityQuery &Query) -> bool {
1131         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1132                                           Query.MMODescrs[0].SizeInBits);
1133       }, bitcastToRegisterType(0));
1134 
1135     if (!IsStore) {
1136       // Widen suitably aligned loads by loading extra bytes. The standard
1137       // legalization actions can't properly express widening memory operands.
1138       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1139         return shouldWidenLoad(ST, Query, G_LOAD);
1140       });
1141     }
1142 
1143     // FIXME: load/store narrowing should be moved to lower action
1144     Actions
1145         .narrowScalarIf(
1146             [=](const LegalityQuery &Query) -> bool {
1147               return !Query.Types[0].isVector() &&
1148                      needToSplitMemOp(Query, Op == G_LOAD);
1149             },
1150             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1151               const LLT DstTy = Query.Types[0];
1152               const LLT PtrTy = Query.Types[1];
1153 
1154               const unsigned DstSize = DstTy.getSizeInBits();
1155               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1156 
1157               // Split extloads.
1158               if (DstSize > MemSize)
1159                 return std::make_pair(0, LLT::scalar(MemSize));
1160 
1161               if (!isPowerOf2_32(DstSize)) {
1162                 // We're probably decomposing an odd sized store. Try to split
1163                 // to the widest type. TODO: Account for alignment. As-is it
1164                 // should be OK, since the new parts will be further legalized.
1165                 unsigned FloorSize = PowerOf2Floor(DstSize);
1166                 return std::make_pair(0, LLT::scalar(FloorSize));
1167               }
1168 
1169               if (DstSize > 32 && (DstSize % 32 != 0)) {
1170                 // FIXME: Need a way to specify non-extload of larger size if
1171                 // suitably aligned.
1172                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1173               }
1174 
1175               unsigned MaxSize = maxSizeForAddrSpace(ST,
1176                                                      PtrTy.getAddressSpace(),
1177                                                      Op == G_LOAD);
1178               if (MemSize > MaxSize)
1179                 return std::make_pair(0, LLT::scalar(MaxSize));
1180 
1181               unsigned Align = Query.MMODescrs[0].AlignInBits;
1182               return std::make_pair(0, LLT::scalar(Align));
1183             })
1184         .fewerElementsIf(
1185             [=](const LegalityQuery &Query) -> bool {
1186               return Query.Types[0].isVector() &&
1187                      needToSplitMemOp(Query, Op == G_LOAD);
1188             },
1189             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190               const LLT DstTy = Query.Types[0];
1191               const LLT PtrTy = Query.Types[1];
1192 
1193               LLT EltTy = DstTy.getElementType();
1194               unsigned MaxSize = maxSizeForAddrSpace(ST,
1195                                                      PtrTy.getAddressSpace(),
1196                                                      Op == G_LOAD);
1197 
1198               // FIXME: Handle widened to power of 2 results better. This ends
1199               // up scalarizing.
1200               // FIXME: 3 element stores scalarized on SI
1201 
1202               // Split if it's too large for the address space.
1203               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1204                 unsigned NumElts = DstTy.getNumElements();
1205                 unsigned EltSize = EltTy.getSizeInBits();
1206 
1207                 if (MaxSize % EltSize == 0) {
1208                   return std::make_pair(
1209                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1210                 }
1211 
1212                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1213 
1214                 // FIXME: Refine when odd breakdowns handled
1215                 // The scalars will need to be re-legalized.
1216                 if (NumPieces == 1 || NumPieces >= NumElts ||
1217                     NumElts % NumPieces != 0)
1218                   return std::make_pair(0, EltTy);
1219 
1220                 return std::make_pair(0,
1221                                       LLT::vector(NumElts / NumPieces, EltTy));
1222               }
1223 
1224               // FIXME: We could probably handle weird extending loads better.
1225               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1226               if (DstTy.getSizeInBits() > MemSize)
1227                 return std::make_pair(0, EltTy);
1228 
1229               unsigned EltSize = EltTy.getSizeInBits();
1230               unsigned DstSize = DstTy.getSizeInBits();
1231               if (!isPowerOf2_32(DstSize)) {
1232                 // We're probably decomposing an odd sized store. Try to split
1233                 // to the widest type. TODO: Account for alignment. As-is it
1234                 // should be OK, since the new parts will be further legalized.
1235                 unsigned FloorSize = PowerOf2Floor(DstSize);
1236                 return std::make_pair(
1237                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1238               }
1239 
1240               // Need to split because of alignment.
1241               unsigned Align = Query.MMODescrs[0].AlignInBits;
1242               if (EltSize > Align &&
1243                   (EltSize / Align < DstTy.getNumElements())) {
1244                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1245               }
1246 
1247               // May need relegalization for the scalars.
1248               return std::make_pair(0, EltTy);
1249             })
1250     .lowerIfMemSizeNotPow2()
1251     .minScalar(0, S32);
1252 
1253     if (IsStore)
1254       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1255 
1256     Actions
1257         .widenScalarToNextPow2(0)
1258         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1259         .lower();
1260   }
1261 
1262   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1263                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1264                                                   {S32, GlobalPtr, 16, 2 * 8},
1265                                                   {S32, LocalPtr, 8, 8},
1266                                                   {S32, LocalPtr, 16, 16},
1267                                                   {S32, PrivatePtr, 8, 8},
1268                                                   {S32, PrivatePtr, 16, 16},
1269                                                   {S32, ConstantPtr, 8, 8},
1270                                                   {S32, ConstantPtr, 16, 2 * 8}});
1271   if (ST.hasFlatAddressSpace()) {
1272     ExtLoads.legalForTypesWithMemDesc(
1273         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1274   }
1275 
1276   ExtLoads.clampScalar(0, S32, S32)
1277           .widenScalarToNextPow2(0)
1278           .unsupportedIfMemSizeNotPow2()
1279           .lower();
1280 
1281   auto &Atomics = getActionDefinitionsBuilder(
1282     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1283      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1284      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1285      G_ATOMICRMW_UMIN})
1286     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1287                {S64, GlobalPtr}, {S64, LocalPtr},
1288                {S32, RegionPtr}, {S64, RegionPtr}});
1289   if (ST.hasFlatAddressSpace()) {
1290     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1291   }
1292 
1293   if (ST.hasLDSFPAtomics()) {
1294     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1295       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1296   }
1297 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1300   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302                 {S32, FlatPtr}, {S64, FlatPtr}})
1303     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304                {S32, RegionPtr}, {S64, RegionPtr}});
1305   // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307   // Condition should be s32 for scalar, s1 for vector.
1308   getActionDefinitionsBuilder(G_SELECT)
1309     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1310           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1311           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1312     .clampScalar(0, S16, S64)
1313     .scalarize(1)
1314     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1315     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1316     .clampMaxNumElements(0, S32, 2)
1317     .clampMaxNumElements(0, LocalPtr, 2)
1318     .clampMaxNumElements(0, PrivatePtr, 2)
1319     .scalarize(0)
1320     .widenScalarToNextPow2(0)
1321     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1322 
1323   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1324   // be more flexible with the shift amount type.
1325   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1326     .legalFor({{S32, S32}, {S64, S32}});
1327   if (ST.has16BitInsts()) {
1328     if (ST.hasVOP3PInsts()) {
1329       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1330             .clampMaxNumElements(0, S16, 2);
1331     } else
1332       Shifts.legalFor({{S16, S16}});
1333 
1334     // TODO: Support 16-bit shift amounts for all types
1335     Shifts.widenScalarIf(
1336       [=](const LegalityQuery &Query) {
1337         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1338         // 32-bit amount.
1339         const LLT ValTy = Query.Types[0];
1340         const LLT AmountTy = Query.Types[1];
1341         return ValTy.getSizeInBits() <= 16 &&
1342                AmountTy.getSizeInBits() < 16;
1343       }, changeTo(1, S16));
1344     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1345     Shifts.clampScalar(1, S32, S32);
1346     Shifts.clampScalar(0, S16, S64);
1347     Shifts.widenScalarToNextPow2(0, 16);
1348 
1349     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1350       .minScalar(0, S16)
1351       .scalarize(0)
1352       .lower();
1353   } else {
1354     // Make sure we legalize the shift amount type first, as the general
1355     // expansion for the shifted type will produce much worse code if it hasn't
1356     // been truncated already.
1357     Shifts.clampScalar(1, S32, S32);
1358     Shifts.clampScalar(0, S32, S64);
1359     Shifts.widenScalarToNextPow2(0, 32);
1360 
1361     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1362       .minScalar(0, S32)
1363       .scalarize(0)
1364       .lower();
1365   }
1366   Shifts.scalarize(0);
1367 
1368   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1369     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1370     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1371     unsigned IdxTypeIdx = 2;
1372 
1373     getActionDefinitionsBuilder(Op)
1374       .customIf([=](const LegalityQuery &Query) {
1375           const LLT EltTy = Query.Types[EltTypeIdx];
1376           const LLT VecTy = Query.Types[VecTypeIdx];
1377           const LLT IdxTy = Query.Types[IdxTypeIdx];
1378           const unsigned EltSize = EltTy.getSizeInBits();
1379           return (EltSize == 32 || EltSize == 64) &&
1380                   VecTy.getSizeInBits() % 32 == 0 &&
1381                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1382                   IdxTy.getSizeInBits() == 32;
1383         })
1384       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1385                  bitcastToVectorElement32(VecTypeIdx))
1386       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1387       .bitcastIf(
1388         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1389         [=](const LegalityQuery &Query) {
1390           // For > 64-bit element types, try to turn this into a 64-bit
1391           // element vector since we may be able to do better indexing
1392           // if this is scalar. If not, fall back to 32.
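          // e.g. a <2 x s128> vector is handled as <4 x s64>.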
1393           const LLT EltTy = Query.Types[EltTypeIdx];
1394           const LLT VecTy = Query.Types[VecTypeIdx];
1395           const unsigned DstEltSize = EltTy.getSizeInBits();
1396           const unsigned VecSize = VecTy.getSizeInBits();
1397 
1398           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1399           return std::make_pair(
1400             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1401         })
1402       .clampScalar(EltTypeIdx, S32, S64)
1403       .clampScalar(VecTypeIdx, S32, S64)
1404       .clampScalar(IdxTypeIdx, S32, S32)
1405       .clampMaxNumElements(VecTypeIdx, S32, 32)
1406       // TODO: Clamp elements for 64-bit vectors?
1407       // It should only be necessary with variable indexes.
1408       // As a last resort, lower to the stack
1409       .lower();
1410   }
1411 
1412   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1413     .unsupportedIf([=](const LegalityQuery &Query) {
1414         const LLT &EltTy = Query.Types[1].getElementType();
1415         return Query.Types[0] != EltTy;
1416       });
1417 
1418   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1419     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1420     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1421 
1422     // FIXME: Doesn't handle extract of illegal sizes.
1423     getActionDefinitionsBuilder(Op)
1424       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1425       // FIXME: Multiples of 16 should not be legal.
1426       .legalIf([=](const LegalityQuery &Query) {
1427           const LLT BigTy = Query.Types[BigTyIdx];
1428           const LLT LitTy = Query.Types[LitTyIdx];
1429           return (BigTy.getSizeInBits() % 32 == 0) &&
1430                  (LitTy.getSizeInBits() % 16 == 0);
1431         })
1432       .widenScalarIf(
1433         [=](const LegalityQuery &Query) {
1434           const LLT BigTy = Query.Types[BigTyIdx];
1435           return (BigTy.getScalarSizeInBits() < 16);
1436         },
1437         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1438       .widenScalarIf(
1439         [=](const LegalityQuery &Query) {
1440           const LLT LitTy = Query.Types[LitTyIdx];
1441           return (LitTy.getScalarSizeInBits() < 16);
1442         },
1443         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1444       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1445       .widenScalarToNextPow2(BigTyIdx, 32);
1446 
1447   }
1448 
1449   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1450     .legalForCartesianProduct(AllS32Vectors, {S32})
1451     .legalForCartesianProduct(AllS64Vectors, {S64})
1452     .clampNumElements(0, V16S32, V32S32)
1453     .clampNumElements(0, V2S64, V16S64)
1454     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1455 
1456   if (ST.hasScalarPackInsts()) {
1457     BuildVector
1458       // FIXME: Should probably widen s1 vectors straight to s32
1459       .minScalarOrElt(0, S16)
1460       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1461       .minScalar(1, S32);
1462 
1463     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1464       .legalFor({V2S16, S32})
1465       .lower();
1466     BuildVector.minScalarOrElt(0, S32);
1467   } else {
1468     BuildVector.customFor({V2S16, S16});
1469     BuildVector.minScalarOrElt(0, S32);
1470 
1471     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1472       .customFor({V2S16, S32})
1473       .lower();
1474   }
1475 
1476   BuildVector.legalIf(isRegisterType(0));
1477 
1478   // FIXME: Clamp maximum size
1479   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1480     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1481     .clampMaxNumElements(0, S32, 32)
1482     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1483     .clampMaxNumElements(0, S16, 64);
1484 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1487   if (ST.hasVOP3PInsts()) {
1488     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1489       .customFor({V2S16, V2S16})
1490       .lower();
1491   } else
1492     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1493 
1494   // Merge/Unmerge
1495   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1496     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1497     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1498 
1499     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1500       const LLT Ty = Query.Types[TypeIdx];
1501       if (Ty.isVector()) {
1502         const LLT &EltTy = Ty.getElementType();
1503         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1504           return true;
1505         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1506           return true;
1507       }
1508       return false;
1509     };
1510 
1511     auto &Builder = getActionDefinitionsBuilder(Op)
1512       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513       .lowerFor({{S16, V2S16}})
1514       .lowerIf([=](const LegalityQuery &Query) {
1515           const LLT BigTy = Query.Types[BigTyIdx];
1516           return BigTy.getSizeInBits() == 32;
1517         })
1518       // Try to widen to s16 first for small types.
1519       // TODO: Only do this on targets with legal s16 shifts
1520       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1521       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1522       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1523       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1524                            elementTypeIs(1, S16)),
1525                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1529       .clampScalar(LitTyIdx, S32, S512)
1530       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1531       // Break up vectors with weird elements into scalars
1532       .fewerElementsIf(
1533         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1534         scalarize(0))
1535       .fewerElementsIf(
1536         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1537         scalarize(1))
1538       .clampScalar(BigTyIdx, S32, MaxScalar);
1539 
1540     if (Op == G_MERGE_VALUES) {
1541       Builder.widenScalarIf(
1542         // TODO: Use 16-bit shifts if legal for 8-bit values?
1543         [=](const LegalityQuery &Query) {
1544           const LLT Ty = Query.Types[LitTyIdx];
1545           return Ty.getSizeInBits() < 32;
1546         },
1547         changeTo(LitTyIdx, S32));
1548     }
1549 
1550     Builder.widenScalarIf(
1551       [=](const LegalityQuery &Query) {
1552         const LLT Ty = Query.Types[BigTyIdx];
1553         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1554           Ty.getSizeInBits() % 16 != 0;
1555       },
1556       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1559         const LLT &Ty = Query.Types[BigTyIdx];
1560         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1561         if (NewSizeInBits >= 256) {
1562           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1563           if (RoundedTo < NewSizeInBits)
1564             NewSizeInBits = RoundedTo;
1565         }
1566         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1567       })
1568       // Any vectors left are the wrong size. Scalarize them.
1569       .scalarize(0)
1570       .scalarize(1);
1571   }
1572 
1573   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1574   // RegBankSelect.
1575   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1576     .legalFor({{S32}, {S64}});
1577 
1578   if (ST.hasVOP3PInsts()) {
1579     SextInReg.lowerFor({{V2S16}})
1580       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1581       // get more vector shift opportunities, since we'll get those when
1582       // expanded.
1583       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1584   } else if (ST.has16BitInsts()) {
1585     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1586   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1589     SextInReg.lowerFor({{S32}, {S64}});
1590   }
1591 
1592   SextInReg
1593     .scalarize(0)
1594     .clampScalar(0, S32, S64)
1595     .lower();
1596 
1597   getActionDefinitionsBuilder(G_FSHR)
1598     .legalFor({{S32, S32}})
1599     .scalarize(0)
1600     .lower();
1601 
1602   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1603     .legalFor({S64});
1604 
1605   getActionDefinitionsBuilder(G_FENCE)
1606     .alwaysLegal();
1607 
1608   getActionDefinitionsBuilder({
1609       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1610       G_FCOPYSIGN,
1611 
1612       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1613       G_ATOMICRMW_NAND,
1614       G_ATOMICRMW_FSUB,
1615       G_READ_REGISTER,
1616       G_WRITE_REGISTER,
1617 
1618       G_SADDO, G_SSUBO,
1619 
1620        // TODO: Implement
1621       G_FMINIMUM, G_FMAXIMUM,
1622       G_FSHL
1623     }).lower();
1624 
1625   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1626         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1627         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1628     .unsupported();
1629 
1630   computeTables();
1631   verify(*ST.getInstrInfo());
1632 }
1633 
1634 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1635                                          MachineInstr &MI) const {
1636   MachineIRBuilder &B = Helper.MIRBuilder;
1637   MachineRegisterInfo &MRI = *B.getMRI();
1638 
1639   switch (MI.getOpcode()) {
1640   case TargetOpcode::G_ADDRSPACE_CAST:
1641     return legalizeAddrSpaceCast(MI, MRI, B);
1642   case TargetOpcode::G_FRINT:
1643     return legalizeFrint(MI, MRI, B);
1644   case TargetOpcode::G_FCEIL:
1645     return legalizeFceil(MI, MRI, B);
1646   case TargetOpcode::G_FREM:
1647     return legalizeFrem(MI, MRI, B);
1648   case TargetOpcode::G_INTRINSIC_TRUNC:
1649     return legalizeIntrinsicTrunc(MI, MRI, B);
1650   case TargetOpcode::G_SITOFP:
1651     return legalizeITOFP(MI, MRI, B, true);
1652   case TargetOpcode::G_UITOFP:
1653     return legalizeITOFP(MI, MRI, B, false);
1654   case TargetOpcode::G_FPTOSI:
1655     return legalizeFPTOI(MI, MRI, B, true);
1656   case TargetOpcode::G_FPTOUI:
1657     return legalizeFPTOI(MI, MRI, B, false);
1658   case TargetOpcode::G_FMINNUM:
1659   case TargetOpcode::G_FMAXNUM:
1660   case TargetOpcode::G_FMINNUM_IEEE:
1661   case TargetOpcode::G_FMAXNUM_IEEE:
1662     return legalizeMinNumMaxNum(Helper, MI);
1663   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1664     return legalizeExtractVectorElt(MI, MRI, B);
1665   case TargetOpcode::G_INSERT_VECTOR_ELT:
1666     return legalizeInsertVectorElt(MI, MRI, B);
1667   case TargetOpcode::G_SHUFFLE_VECTOR:
1668     return legalizeShuffleVector(MI, MRI, B);
1669   case TargetOpcode::G_FSIN:
1670   case TargetOpcode::G_FCOS:
1671     return legalizeSinCos(MI, MRI, B);
1672   case TargetOpcode::G_GLOBAL_VALUE:
1673     return legalizeGlobalValue(MI, MRI, B);
1674   case TargetOpcode::G_LOAD:
1675     return legalizeLoad(Helper, MI);
1676   case TargetOpcode::G_FMAD:
1677     return legalizeFMad(MI, MRI, B);
1678   case TargetOpcode::G_FDIV:
1679     return legalizeFDIV(MI, MRI, B);
1680   case TargetOpcode::G_UDIV:
1681   case TargetOpcode::G_UREM:
1682     return legalizeUDIV_UREM(MI, MRI, B);
1683   case TargetOpcode::G_SDIV:
1684   case TargetOpcode::G_SREM:
1685     return legalizeSDIV_SREM(MI, MRI, B);
1686   case TargetOpcode::G_ATOMIC_CMPXCHG:
1687     return legalizeAtomicCmpXChg(MI, MRI, B);
1688   case TargetOpcode::G_FLOG:
1689     return legalizeFlog(MI, B, numbers::ln2f);
1690   case TargetOpcode::G_FLOG10:
1691     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1692   case TargetOpcode::G_FEXP:
1693     return legalizeFExp(MI, B);
1694   case TargetOpcode::G_FPOW:
1695     return legalizeFPow(MI, B);
1696   case TargetOpcode::G_FFLOOR:
1697     return legalizeFFloor(MI, MRI, B);
1698   case TargetOpcode::G_BUILD_VECTOR:
1699     return legalizeBuildVector(MI, MRI, B);
1700   default:
1701     return false;
1702   }
1703 
1704   llvm_unreachable("expected switch to return");
1705 }
1706 
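// Return the aperture, i.e. the high 32 bits of the 64-bit flat address range
// that corresponds to the given LDS or private address space. On subtargets
// with aperture registers this is read with s_getreg; otherwise it is loaded
// from the amd_queue_t structure pointed to by the implicit queue pointer.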
1707 Register AMDGPULegalizerInfo::getSegmentAperture(
1708   unsigned AS,
1709   MachineRegisterInfo &MRI,
1710   MachineIRBuilder &B) const {
1711   MachineFunction &MF = B.getMF();
1712   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1713   const LLT S32 = LLT::scalar(32);
1714 
1715   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1716 
1717   if (ST.hasApertureRegs()) {
1718     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1719     // getreg.
1720     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1721         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1722         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1723     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1724         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1725         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1726     unsigned Encoding =
1727         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1728         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1729         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1730 
1731     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1732 
1733     B.buildInstr(AMDGPU::S_GETREG_B32)
1734       .addDef(GetReg)
1735       .addImm(Encoding);
1736     MRI.setType(GetReg, S32);
1737 
1738     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1739     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1740   }
1741 
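  // No aperture registers: read the aperture from the amd_queue_t structure
  // that the implicit queue pointer argument points to.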
1742   Register QueuePtr = MRI.createGenericVirtualRegister(
1743     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1744 
1745   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1746     return Register();
1747 
1748   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1749   // private_segment_aperture_base_hi.
1750   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1751 
1752   // TODO: can we be smarter about machine pointer info?
1753   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1754   MachineMemOperand *MMO = MF.getMachineMemOperand(
1755       PtrInfo,
1756       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1757           MachineMemOperand::MOInvariant,
1758       4, commonAlignment(Align(64), StructOffset));
1759 
1760   Register LoadAddr;
1761 
1762   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1763   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1764 }
1765 
1766 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1767   MachineInstr &MI, MachineRegisterInfo &MRI,
1768   MachineIRBuilder &B) const {
1769   MachineFunction &MF = B.getMF();
1770 
1771   const LLT S32 = LLT::scalar(32);
1772   Register Dst = MI.getOperand(0).getReg();
1773   Register Src = MI.getOperand(1).getReg();
1774 
1775   LLT DstTy = MRI.getType(Dst);
1776   LLT SrcTy = MRI.getType(Src);
1777   unsigned DestAS = DstTy.getAddressSpace();
1778   unsigned SrcAS = SrcTy.getAddressSpace();
1779 
1780   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1781   // vector element.
1782   assert(!DstTy.isVector());
1783 
1784   const AMDGPUTargetMachine &TM
1785     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1786 
1787   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1788     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1789     return true;
1790   }
1791 
1792   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1793     // Truncate.
1794     B.buildExtract(Dst, Src, 0);
1795     MI.eraseFromParent();
1796     return true;
1797   }
1798 
1799   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1800     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1801     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1802 
1803     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1804     // another. Merge operands are required to be the same type, but creating an
1805     // extra ptrtoint would be kind of pointless.
1806     auto HighAddr = B.buildConstant(
1807       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1808     B.buildMerge(Dst, {Src, HighAddr});
1809     MI.eraseFromParent();
1810     return true;
1811   }
1812 
1813   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1814     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1815            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1816     unsigned NullVal = TM.getNullPointerValue(DestAS);
1817 
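    // flat -> local/private: the result is the low 32 bits of the flat
    // pointer, with the flat null pointer mapped to the segment null value.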
1818     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1819     auto FlatNull = B.buildConstant(SrcTy, 0);
1820 
1821     // Extract low 32-bits of the pointer.
1822     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1823 
1824     auto CmpRes =
1825         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1826     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1827 
1828     MI.eraseFromParent();
1829     return true;
1830   }
1831 
1832   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1833     return false;
1834 
1835   if (!ST.hasFlatAddressSpace())
1836     return false;
1837 
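  // local/private -> flat: merge the 32-bit segment offset with the aperture
  // to form a 64-bit flat pointer, mapping the segment null value to the flat
  // null pointer.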
1838   auto SegmentNull =
1839       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1840   auto FlatNull =
1841       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1842 
1843   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1844   if (!ApertureReg.isValid())
1845     return false;
1846 
1847   auto CmpRes =
1848       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1849 
1850   // Coerce the type of the low half of the result so we can use merge_values.
1851   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1852 
1853   // TODO: Should we allow mismatched types but matching sizes in merges to
1854   // avoid the ptrtoint?
1855   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1856   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1857 
1858   MI.eraseFromParent();
1859   return true;
1860 }
1861 
1862 bool AMDGPULegalizerInfo::legalizeFrint(
1863   MachineInstr &MI, MachineRegisterInfo &MRI,
1864   MachineIRBuilder &B) const {
1865   Register Src = MI.getOperand(1).getReg();
1866   LLT Ty = MRI.getType(Src);
1867   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1868 
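  // C1 is 2^52: adding it (with the sign of the source) and then subtracting
  // it again rounds to the nearest integer, since doubles of that magnitude
  // have no fractional bits. C2 is the largest double below 2^52; inputs with
  // a larger magnitude are already integers and are returned unchanged.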
1869   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1870   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1871 
1872   auto C1 = B.buildFConstant(Ty, C1Val);
1873   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1874 
1875   // TODO: Should this propagate fast-math-flags?
1876   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1877   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1878 
1879   auto C2 = B.buildFConstant(Ty, C2Val);
1880   auto Fabs = B.buildFAbs(Ty, Src);
1881 
1882   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1883   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1884   MI.eraseFromParent();
1885   return true;
1886 }
1887 
1888 bool AMDGPULegalizerInfo::legalizeFceil(
1889   MachineInstr &MI, MachineRegisterInfo &MRI,
1890   MachineIRBuilder &B) const {
1891 
1892   const LLT S1 = LLT::scalar(1);
1893   const LLT S64 = LLT::scalar(64);
1894 
1895   Register Src = MI.getOperand(1).getReg();
1896   assert(MRI.getType(Src) == S64);
1897 
1898   // result = trunc(src)
1899   // if (src > 0.0 && src != result)
1900   //   result += 1.0
1901 
1902   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1903 
1904   const auto Zero = B.buildFConstant(S64, 0.0);
1905   const auto One = B.buildFConstant(S64, 1.0);
1906   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1907   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1908   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1909   auto Add = B.buildSelect(S64, And, One, Zero);
1910 
1911   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1914 }
1915 
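// Expand frem(x, y) as x - trunc(x / y) * y, with the final multiply and
// subtract folded into a single fma: fma(-trunc(x / y), y, x).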
1916 bool AMDGPULegalizerInfo::legalizeFrem(
1917   MachineInstr &MI, MachineRegisterInfo &MRI,
1918   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1931 }
1932 
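// Extract the exponent field from the high 32 bits of an f64 with the ubfe
// intrinsic and subtract the bias (1023) to get the unbiased exponent.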
1933 static MachineInstrBuilder extractF64Exponent(Register Hi,
1934                                               MachineIRBuilder &B) {
1935   const unsigned FractBits = 52;
1936   const unsigned ExpBits = 11;
1937   LLT S32 = LLT::scalar(32);
1938 
1939   auto Const0 = B.buildConstant(S32, FractBits - 32);
1940   auto Const1 = B.buildConstant(S32, ExpBits);
1941 
1942   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1943     .addUse(Hi)
1944     .addUse(Const0.getReg(0))
1945     .addUse(Const1.getReg(0));
1946 
1947   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1948 }
1949 
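// Expand f64 trunc without a dedicated instruction: build a mask that clears
// the fractional bits implied by the exponent. Exponents below 0 produce a
// signed zero, and exponents above 51 mean the value is already an integer
// and is returned unchanged.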
1950 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1951   MachineInstr &MI, MachineRegisterInfo &MRI,
1952   MachineIRBuilder &B) const {
1953   const LLT S1 = LLT::scalar(1);
1954   const LLT S32 = LLT::scalar(32);
1955   const LLT S64 = LLT::scalar(64);
1956 
1957   Register Src = MI.getOperand(1).getReg();
1958   assert(MRI.getType(Src) == S64);
1959 
1960   // TODO: Should this use extract since the low half is unused?
1961   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1962   Register Hi = Unmerge.getReg(1);
1963 
1964   // Extract the upper half, since this is where we will find the sign and
1965   // exponent.
1966   auto Exp = extractF64Exponent(Hi, B);
1967 
1968   const unsigned FractBits = 52;
1969 
1970   // Extract the sign bit.
1971   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1972   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1973 
1974   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1975 
1976   const auto Zero32 = B.buildConstant(S32, 0);
1977 
1978   // Extend back to 64-bits.
1979   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1980 
1981   auto Shr = B.buildAShr(S64, FractMask, Exp);
1982   auto Not = B.buildNot(S64, Shr);
1983   auto Tmp0 = B.buildAnd(S64, Src, Not);
1984   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1985 
1986   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1987   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1988 
1989   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1990   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1991   MI.eraseFromParent();
1992   return true;
1993 }
1994 
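// Convert a 64-bit integer to f64 by converting the two 32-bit halves
// separately, scaling the converted high half by 2^32 with ldexp, and adding
// the converted low half.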
1995 bool AMDGPULegalizerInfo::legalizeITOFP(
1996   MachineInstr &MI, MachineRegisterInfo &MRI,
1997   MachineIRBuilder &B, bool Signed) const {
1998 
1999   Register Dst = MI.getOperand(0).getReg();
2000   Register Src = MI.getOperand(1).getReg();
2001 
2002   const LLT S64 = LLT::scalar(64);
2003   const LLT S32 = LLT::scalar(32);
2004 
2005   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2006 
2007   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2008 
2009   auto CvtHi = Signed ?
2010     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2011     B.buildUITOFP(S64, Unmerge.getReg(1));
2012 
2013   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2014 
2015   auto ThirtyTwo = B.buildConstant(S32, 32);
2016   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2017     .addUse(CvtHi.getReg(0))
2018     .addUse(ThirtyTwo.getReg(0));
2019 
2020   // TODO: Should this propagate fast-math-flags?
2021   B.buildFAdd(Dst, LdExp, CvtLo);
2022   MI.eraseFromParent();
2023   return true;
2024 }
2025 
2026 // TODO: Copied from DAG implementation. Verify logic and document how this
2027 // actually works.
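// The constants: K0 is 2^-32 and K1 is -2^32. The truncated value is split
// into Hi = floor(trunc(x) * 2^-32) and Lo = fma(Hi, -2^32, trunc(x)), and
// each half is then converted to a 32-bit integer and merged into the result.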
2028 bool AMDGPULegalizerInfo::legalizeFPTOI(
2029   MachineInstr &MI, MachineRegisterInfo &MRI,
2030   MachineIRBuilder &B, bool Signed) const {
2031 
2032   Register Dst = MI.getOperand(0).getReg();
2033   Register Src = MI.getOperand(1).getReg();
2034 
2035   const LLT S64 = LLT::scalar(64);
2036   const LLT S32 = LLT::scalar(32);
2037 
2038   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2039 
2040   unsigned Flags = MI.getFlags();
2041 
2042   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2043   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2044   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2045 
2046   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2047   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2048   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2049 
2050   auto Hi = Signed ?
2051     B.buildFPTOSI(S32, FloorMul) :
2052     B.buildFPTOUI(S32, FloorMul);
2053   auto Lo = B.buildFPTOUI(S32, Fma);
2054 
2055   B.buildMerge(Dst, { Lo, Hi });
2056   MI.eraseFromParent();
2057 
2058   return true;
2059 }
2060 
2061 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2062                                                MachineInstr &MI) const {
2063   MachineFunction &MF = Helper.MIRBuilder.getMF();
2064   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2065 
2066   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2067                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2068 
2069   // With ieee_mode disabled, the instructions have the correct behavior
2070   // already for G_FMINNUM/G_FMAXNUM
2071   if (!MFI->getMode().IEEE)
2072     return !IsIEEEOp;
2073 
2074   if (IsIEEEOp)
2075     return true;
2076 
2077   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2081   MachineInstr &MI, MachineRegisterInfo &MRI,
2082   MachineIRBuilder &B) const {
2083   // TODO: Should move some of this into LegalizerHelper.
2084 
2085   // TODO: Promote dynamic indexing of s16 to s32
2086 
2087   // FIXME: Artifact combiner probably should have replaced the truncated
2088   // constant before this, so we shouldn't need
2089   // getConstantVRegValWithLookThrough.
2090   Optional<ValueAndVReg> MaybeIdxVal =
2091       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2092   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2093     return true;
2094   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2095 
2096   Register Dst = MI.getOperand(0).getReg();
2097   Register Vec = MI.getOperand(1).getReg();
2098 
2099   LLT VecTy = MRI.getType(Vec);
2100   LLT EltTy = VecTy.getElementType();
2101   assert(EltTy == MRI.getType(Dst));
2102 
2103   if (IdxVal < VecTy.getNumElements())
2104     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2105   else
2106     B.buildUndef(Dst);
2107 
2108   MI.eraseFromParent();
2109   return true;
2110 }
2111 
2112 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2113   MachineInstr &MI, MachineRegisterInfo &MRI,
2114   MachineIRBuilder &B) const {
2115   // TODO: Should move some of this into LegalizerHelper.
2116 
2117   // TODO: Promote dynamic indexing of s16 to s32
2118 
2119   // FIXME: Artifact combiner probably should have replaced the truncated
2120   // constant before this, so we shouldn't need
2121   // getConstantVRegValWithLookThrough.
2122   Optional<ValueAndVReg> MaybeIdxVal =
2123       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2124   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2125     return true;
2126 
2127   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2128   Register Dst = MI.getOperand(0).getReg();
2129   Register Vec = MI.getOperand(1).getReg();
2130   Register Ins = MI.getOperand(2).getReg();
2131 
2132   LLT VecTy = MRI.getType(Vec);
2133   LLT EltTy = VecTy.getElementType();
2134   assert(EltTy == MRI.getType(Ins));
2135 
2136   if (IdxVal < VecTy.getNumElements())
2137     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2138   else
2139     B.buildUndef(Dst);
2140 
2141   MI.eraseFromParent();
2142   return true;
2143 }
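// Keep v2s16 shuffles whose mask can be matched by the packed VOP3P
// instructions; lower everything else with the generic shuffle expansion.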
2144 
2145 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2146   MachineInstr &MI, MachineRegisterInfo &MRI,
2147   MachineIRBuilder &B) const {
2148   const LLT V2S16 = LLT::vector(2, 16);
2149 
2150   Register Dst = MI.getOperand(0).getReg();
2151   Register Src0 = MI.getOperand(1).getReg();
2152   LLT DstTy = MRI.getType(Dst);
2153   LLT SrcTy = MRI.getType(Src0);
2154 
2155   if (SrcTy == V2S16 && DstTy == V2S16 &&
2156       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2157     return true;
2158 
2159   MachineIRBuilder HelperBuilder(MI);
2160   GISelObserverWrapper DummyObserver;
2161   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2162   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2163 }
2164 
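// The hardware sin/cos instructions take the argument scaled by 1/(2*pi).
// Subtargets with a reduced trig input range additionally need an explicit
// fract to bring the scaled value into range.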
2165 bool AMDGPULegalizerInfo::legalizeSinCos(
2166   MachineInstr &MI, MachineRegisterInfo &MRI,
2167   MachineIRBuilder &B) const {
2168 
2169   Register DstReg = MI.getOperand(0).getReg();
2170   Register SrcReg = MI.getOperand(1).getReg();
2171   LLT Ty = MRI.getType(DstReg);
2172   unsigned Flags = MI.getFlags();
2173 
2174   Register TrigVal;
2175   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2176   if (ST.hasTrigReducedRange()) {
2177     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2178     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2179       .addUse(MulVal.getReg(0))
2180       .setMIFlags(Flags).getReg(0);
2181   } else
2182     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2183 
2184   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2185     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2186   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2187     .addUse(TrigVal)
2188     .setMIFlags(Flags);
2189   MI.eraseFromParent();
2190   return true;
2191 }
2192 
2193 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2194                                                   MachineIRBuilder &B,
2195                                                   const GlobalValue *GV,
2196                                                   int64_t Offset,
2197                                                   unsigned GAFlags) const {
2198   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2199   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2200   // to the following code sequence:
2201   //
2202   // For constant address space:
2203   //   s_getpc_b64 s[0:1]
2204   //   s_add_u32 s0, s0, $symbol
2205   //   s_addc_u32 s1, s1, 0
2206   //
2207   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2208   //   a fixup or relocation is emitted to replace $symbol with a literal
2209   //   constant, which is a pc-relative offset from the encoding of the $symbol
2210   //   operand to the global variable.
2211   //
2212   // For global address space:
2213   //   s_getpc_b64 s[0:1]
2214   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2215   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2216   //
2217   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2218   //   fixups or relocations are emitted to replace $symbol@*@lo and
2219   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2220   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2221   //   operand to the global variable.
2222   //
2223   // What we want here is an offset from the value returned by s_getpc
2224   // (which is the address of the s_add_u32 instruction) to the global
2225   // variable, but since the encoding of $symbol starts 4 bytes after the start
2226   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2227   // small. This requires us to add 4 to the global variable offset in order to
2228   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2229   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2230   // instruction.
2231 
2232   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2233 
2234   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2235     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2236 
2237   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2238     .addDef(PCReg);
2239 
2240   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2241   if (GAFlags == SIInstrInfo::MO_NONE)
2242     MIB.addImm(0);
2243   else
2244     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2245 
2246   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2247 
2248   if (PtrTy.getSizeInBits() == 32)
2249     B.buildExtract(DstReg, PCReg, 0);
2250   return true;
2251  }
2252 
2253 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2254   MachineInstr &MI, MachineRegisterInfo &MRI,
2255   MachineIRBuilder &B) const {
2256   Register DstReg = MI.getOperand(0).getReg();
2257   LLT Ty = MRI.getType(DstReg);
2258   unsigned AS = Ty.getAddressSpace();
2259 
2260   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2261   MachineFunction &MF = B.getMF();
2262   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2263 
2264   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2265     if (!MFI->isModuleEntryFunction()) {
2266       const Function &Fn = MF.getFunction();
2267       DiagnosticInfoUnsupported BadLDSDecl(
2268         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2269         DS_Warning);
2270       Fn.getContext().diagnose(BadLDSDecl);
2271 
2272       // We currently don't have a way to correctly allocate LDS objects that
2273       // aren't directly associated with a kernel. We do force inlining of
2274       // functions that use local objects. However, if these dead functions are
2275       // not eliminated, we don't want a compile time error. Just emit a warning
2276       // and a trap, since there should be no callable path here.
2277       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2278       B.buildUndef(DstReg);
2279       MI.eraseFromParent();
2280       return true;
2281     }
2282 
2283     // TODO: We could emit code to handle the initialization somewhere.
2284     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2285       const SITargetLowering *TLI = ST.getTargetLowering();
2286       if (!TLI->shouldUseLDSConstAddress(GV)) {
2287         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2288         return true; // Leave in place;
2289       }
2290 
2291       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2292         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. These variables are
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
2298         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2299           // Adjust alignment for that dynamic shared memory array.
2300           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2301           LLT S32 = LLT::scalar(32);
2302           auto Sz =
2303               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2304           B.buildIntToPtr(DstReg, Sz);
2305           MI.eraseFromParent();
2306           return true;
2307         }
2308       }
2309 
2310       B.buildConstant(
2311           DstReg,
2312           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2313       MI.eraseFromParent();
2314       return true;
2315     }
2316 
2317     const Function &Fn = MF.getFunction();
2318     DiagnosticInfoUnsupported BadInit(
2319       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2320     Fn.getContext().diagnose(BadInit);
2321     return true;
2322   }
2323 
2324   const SITargetLowering *TLI = ST.getTargetLowering();
2325 
2326   if (TLI->shouldEmitFixup(GV)) {
2327     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2328     MI.eraseFromParent();
2329     return true;
2330   }
2331 
2332   if (TLI->shouldEmitPCReloc(GV)) {
2333     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2334     MI.eraseFromParent();
2335     return true;
2336   }
2337 
2338   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2339   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2340 
2341   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2342       MachinePointerInfo::getGOT(MF),
2343       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2344           MachineMemOperand::MOInvariant,
2345       8 /*Size*/, Align(8));
2346 
2347   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2348 
2349   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2351     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2352     B.buildExtract(DstReg, Load, 0);
2353   } else
2354     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2355 
2356   MI.eraseFromParent();
2357   return true;
2358 }
2359 
2360 static LLT widenToNextPowerOf2(LLT Ty) {
2361   if (Ty.isVector())
2362     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2363   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2364 }
2365 
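// Custom load legalization: cast 32-bit constant-address-space pointers to
// the 64-bit constant address space, and widen non-power-of-2 sized loads
// when the alignment allows it, extracting or truncating the original value
// from the wider load result.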
2366 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2367                                        MachineInstr &MI) const {
2368   MachineIRBuilder &B = Helper.MIRBuilder;
2369   MachineRegisterInfo &MRI = *B.getMRI();
2370   GISelChangeObserver &Observer = Helper.Observer;
2371 
2372   Register PtrReg = MI.getOperand(1).getReg();
2373   LLT PtrTy = MRI.getType(PtrReg);
2374   unsigned AddrSpace = PtrTy.getAddressSpace();
2375 
2376   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2377     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2378     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2379     Observer.changingInstr(MI);
2380     MI.getOperand(1).setReg(Cast.getReg(0));
2381     Observer.changedInstr(MI);
2382     return true;
2383   }
2384 
2385   Register ValReg = MI.getOperand(0).getReg();
2386   LLT ValTy = MRI.getType(ValReg);
2387 
2388   MachineMemOperand *MMO = *MI.memoperands_begin();
2389   const unsigned ValSize = ValTy.getSizeInBits();
2390   const unsigned MemSize = 8 * MMO->getSize();
2391   const Align MemAlign = MMO->getAlign();
2392   const unsigned AlignInBits = 8 * MemAlign.value();
2393 
2394   // Widen non-power-of-2 loads to the alignment if needed
2395   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2396     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2397 
2398     // This was already the correct extending load result type, so just adjust
2399     // the memory type.
2400     if (WideMemSize == ValSize) {
2401       MachineFunction &MF = B.getMF();
2402 
2403       MachineMemOperand *WideMMO =
2404           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2405       Observer.changingInstr(MI);
2406       MI.setMemRefs(MF, {WideMMO});
2407       Observer.changedInstr(MI);
2408       return true;
2409     }
2410 
2411     // Don't bother handling edge case that should probably never be produced.
2412     if (ValSize > WideMemSize)
2413       return false;
2414 
2415     LLT WideTy = widenToNextPowerOf2(ValTy);
2416 
2417     Register WideLoad;
2418     if (!WideTy.isVector()) {
2419       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2420       B.buildTrunc(ValReg, WideLoad).getReg(0);
2421     } else {
2422       // Extract the subvector.
2423 
2424       if (isRegisterType(ValTy)) {
2425         // If this a case where G_EXTRACT is legal, use it.
2426         // (e.g. <3 x s32> -> <4 x s32>)
2427         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2428         B.buildExtract(ValReg, WideLoad, 0);
2429       } else {
2430         // For cases where the widened type isn't a nice register value, unmerge
2431         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2432         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2433         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2434         B.setInsertPt(B.getMBB(), MI.getIterator());
2435         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2436       }
2437     }
2438 
2439     MI.eraseFromParent();
2440     return true;
2441   }
2442 
2443   return false;
2444 }
2445 
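// G_FMAD is only kept when denormals are flushed for the result type;
// otherwise it is expanded to fmul + fadd by the generic helper.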
2446 bool AMDGPULegalizerInfo::legalizeFMad(
2447   MachineInstr &MI, MachineRegisterInfo &MRI,
2448   MachineIRBuilder &B) const {
2449   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2450   assert(Ty.isScalar());
2451 
2452   MachineFunction &MF = B.getMF();
2453   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2454 
2455   // TODO: Always legal with future ftz flag.
  // FIXME: Do we only need to check the output denormal mode?
2457   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2458     return true;
2459   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2460     return true;
2461 
2462   MachineIRBuilder HelperBuilder(MI);
2463   GISelObserverWrapper DummyObserver;
2464   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2465   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2466 }
2467 
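// The target cmpxchg instruction takes the new value and the compare value
// packed together, so build a <2 x T> vector of (new, cmp) and emit
// G_AMDGPU_ATOMIC_CMPXCHG.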
2468 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2469   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2470   Register DstReg = MI.getOperand(0).getReg();
2471   Register PtrReg = MI.getOperand(1).getReg();
2472   Register CmpVal = MI.getOperand(2).getReg();
2473   Register NewVal = MI.getOperand(3).getReg();
2474 
2475   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2476          "this should not have been custom lowered");
2477 
2478   LLT ValTy = MRI.getType(CmpVal);
2479   LLT VecTy = LLT::vector(2, ValTy);
2480 
2481   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2482 
2483   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2484     .addDef(DstReg)
2485     .addUse(PtrReg)
2486     .addUse(PackedVal)
2487     .setMemRefs(MI.memoperands());
2488 
2489   MI.eraseFromParent();
2490   return true;
2491 }
2492 
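// Expand G_FLOG/G_FLOG10 as log2(x) * Log2BaseInverted, where Log2BaseInverted
// is log_base(2) (ln(2) for the natural log, ln(2)/ln(10) for log10).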
2493 bool AMDGPULegalizerInfo::legalizeFlog(
2494   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2495   Register Dst = MI.getOperand(0).getReg();
2496   Register Src = MI.getOperand(1).getReg();
2497   LLT Ty = B.getMRI()->getType(Dst);
2498   unsigned Flags = MI.getFlags();
2499 
2500   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2501   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2502 
2503   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2504   MI.eraseFromParent();
2505   return true;
2506 }
2507 
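// Expand G_FEXP as exp2(x * log2(e)).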
2508 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2509                                        MachineIRBuilder &B) const {
2510   Register Dst = MI.getOperand(0).getReg();
2511   Register Src = MI.getOperand(1).getReg();
2512   unsigned Flags = MI.getFlags();
2513   LLT Ty = B.getMRI()->getType(Dst);
2514 
2515   auto K = B.buildFConstant(Ty, numbers::log2e);
2516   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2517   B.buildFExp2(Dst, Mul, Flags);
2518   MI.eraseFromParent();
2519   return true;
2520 }
2521 
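// Expand G_FPOW as exp2(log2(x) * y). The multiply uses the fmul_legacy
// intrinsic; since there is no f16 variant, the f16 case extends to f32 for
// the multiply and truncates back to f16 before the exp2.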
2522 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2523                                        MachineIRBuilder &B) const {
2524   Register Dst = MI.getOperand(0).getReg();
2525   Register Src0 = MI.getOperand(1).getReg();
2526   Register Src1 = MI.getOperand(2).getReg();
2527   unsigned Flags = MI.getFlags();
2528   LLT Ty = B.getMRI()->getType(Dst);
2529   const LLT S16 = LLT::scalar(16);
2530   const LLT S32 = LLT::scalar(32);
2531 
2532   if (Ty == S32) {
2533     auto Log = B.buildFLog2(S32, Src0, Flags);
2534     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2535       .addUse(Log.getReg(0))
2536       .addUse(Src1)
2537       .setMIFlags(Flags);
2538     B.buildFExp2(Dst, Mul, Flags);
2539   } else if (Ty == S16) {
2540     // There's no f16 fmul_legacy, so we need to convert for it.
2541     auto Log = B.buildFLog2(S16, Src0, Flags);
2542     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2543     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2544     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2545       .addUse(Ext0.getReg(0))
2546       .addUse(Ext1.getReg(0))
2547       .setMIFlags(Flags);
2548 
2549     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2550   } else
2551     return false;
2552 
2553   MI.eraseFromParent();
2554   return true;
2555 }
2556 
2557 // Find a source register, ignoring any possible source modifiers.
2558 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2559   Register ModSrc = OrigSrc;
2560   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2561     ModSrc = SrcFNeg->getOperand(1).getReg();
2562     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2563       ModSrc = SrcFAbs->getOperand(1).getReg();
2564   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2565     ModSrc = SrcFAbs->getOperand(1).getReg();
2566   return ModSrc;
2567 }
2568 
2569 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2570                                          MachineRegisterInfo &MRI,
2571                                          MachineIRBuilder &B) const {
2572 
2573   const LLT S1 = LLT::scalar(1);
2574   const LLT S64 = LLT::scalar(64);
2575   Register Dst = MI.getOperand(0).getReg();
2576   Register OrigSrc = MI.getOperand(1).getReg();
2577   unsigned Flags = MI.getFlags();
2578   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2579          "this should not have been custom lowered");
2580 
2581   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2582   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2583   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2584   // V_FRACT bug is:
2585   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2586   //
2587   // Convert floor(x) to (x - fract(x))
2588 
2589   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2590     .addUse(OrigSrc)
2591     .setMIFlags(Flags);
2592 
2593   // Give source modifier matching some assistance before obscuring a foldable
2594   // pattern.
2595 
2596   // TODO: We can avoid the neg on the fract? The input sign to fract
2597   // shouldn't matter?
2598   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2599 
2600   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2601 
2602   Register Min = MRI.createGenericVirtualRegister(S64);
2603 
2604   // We don't need to concern ourselves with the snan handling difference, so
2605   // use the one which will directly select.
2606   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2607   if (MFI->getMode().IEEE)
2608     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2609   else
2610     B.buildFMinNum(Min, Fract, Const, Flags);
2611 
2612   Register CorrectedFract = Min;
2613   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2614     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2615     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2616   }
2617 
2618   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2619   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2620 
2621   MI.eraseFromParent();
2622   return true;
2623 }
2624 
2625 // Turn an illegal packed v2s16 build vector into bit operations.
2626 // TODO: This should probably be a bitcast action in LegalizerHelper.
2627 bool AMDGPULegalizerInfo::legalizeBuildVector(
2628   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2629   Register Dst = MI.getOperand(0).getReg();
2630   const LLT S32 = LLT::scalar(32);
2631   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2632 
2633   Register Src0 = MI.getOperand(1).getReg();
2634   Register Src1 = MI.getOperand(2).getReg();
2635   assert(MRI.getType(Src0) == LLT::scalar(16));
2636 
2637   auto Merge = B.buildMerge(S32, {Src0, Src1});
2638   B.buildBitcast(Dst, Merge);
2639 
2640   MI.eraseFromParent();
2641   return true;
2642 }
2643 
2644 // Check that this is a G_XOR x, -1
2645 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2646   if (MI.getOpcode() != TargetOpcode::G_XOR)
2647     return false;
2648   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2649   return ConstVal && *ConstVal == -1;
2650 }
2651 
2652 // Return the use branch instruction, otherwise null if the usage is invalid.
2653 static MachineInstr *
2654 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2655                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2656   Register CondDef = MI.getOperand(0).getReg();
2657   if (!MRI.hasOneNonDBGUse(CondDef))
2658     return nullptr;
2659 
2660   MachineBasicBlock *Parent = MI.getParent();
2661   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2662 
2663   if (isNot(MRI, *UseMI)) {
2664     Register NegatedCond = UseMI->getOperand(0).getReg();
2665     if (!MRI.hasOneNonDBGUse(NegatedCond))
2666       return nullptr;
2667 
2668     // We're deleting the def of this value, so we need to remove it.
2669     UseMI->eraseFromParent();
2670 
2671     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2672     Negated = true;
2673   }
2674 
2675   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2676     return nullptr;
2677 
2678   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2679   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2680   if (Next == Parent->end()) {
2681     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2682     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2683       return nullptr;
2684     UncondBrTarget = &*NextMBB;
2685   } else {
2686     if (Next->getOpcode() != AMDGPU::G_BR)
2687       return nullptr;
2688     Br = &*Next;
2689     UncondBrTarget = Br->getOperand(0).getMBB();
2690   }
2691 
2692   return UseMI;
2693 }
2694 
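// Copy a preloaded argument from its incoming physical register into DstReg,
// applying the mask and shift for values packed into a single register.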
2695 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2696                                          const ArgDescriptor *Arg,
2697                                          const TargetRegisterClass *ArgRC,
2698                                          LLT ArgTy) const {
2699   MCRegister SrcReg = Arg->getRegister();
2700   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2701   assert(DstReg.isVirtual() && "Virtual register expected");
2702 
2703   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2704                                              ArgTy);
2705   if (Arg->isMasked()) {
2706     // TODO: Should we try to emit this once in the entry block?
2707     const LLT S32 = LLT::scalar(32);
2708     const unsigned Mask = Arg->getMask();
2709     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2710 
2711     Register AndMaskSrc = LiveIn;
2712 
2713     if (Shift != 0) {
2714       auto ShiftAmt = B.buildConstant(S32, Shift);
2715       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2716     }
2717 
2718     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2719   } else {
2720     B.buildCopy(DstReg, LiveIn);
2721   }
2722 
2723   return true;
2724 }
2725 
2726 bool AMDGPULegalizerInfo::loadInputValue(
2727     Register DstReg, MachineIRBuilder &B,
2728     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2729   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2730   const ArgDescriptor *Arg;
2731   const TargetRegisterClass *ArgRC;
2732   LLT ArgTy;
2733   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2734 
2735   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2736     return false; // TODO: Handle these
2737   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2738 }
2739 
2740 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2741     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2742     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2743   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2744     return false;
2745 
2746   MI.eraseFromParent();
2747   return true;
2748 }
2749 
2750 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2751                                        MachineRegisterInfo &MRI,
2752                                        MachineIRBuilder &B) const {
2753   Register Dst = MI.getOperand(0).getReg();
2754   LLT DstTy = MRI.getType(Dst);
2755   LLT S16 = LLT::scalar(16);
2756   LLT S32 = LLT::scalar(32);
2757   LLT S64 = LLT::scalar(64);
2758 
2759   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2760     return true;
2761 
2762   if (DstTy == S16)
2763     return legalizeFDIV16(MI, MRI, B);
2764   if (DstTy == S32)
2765     return legalizeFDIV32(MI, MRI, B);
2766   if (DstTy == S64)
2767     return legalizeFDIV64(MI, MRI, B);
2768 
2769   return false;
2770 }
2771 
2772 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2773                                                   Register DstReg,
2774                                                   Register X,
2775                                                   Register Y,
2776                                                   bool IsDiv) const {
2777   const LLT S1 = LLT::scalar(1);
2778   const LLT S32 = LLT::scalar(32);
2779 
2780   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2781   // algorithm used here.
2782 
2783   // Initial estimate of inv(y).
2784   auto FloatY = B.buildUITOFP(S32, Y);
2785   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2786   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2787   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2788   auto Z = B.buildFPTOUI(S32, ScaledY);
2789 
2790   // One round of UNR.
2791   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2792   auto NegYZ = B.buildMul(S32, NegY, Z);
2793   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2794 
2795   // Quotient/remainder estimate.
2796   auto Q = B.buildUMulH(S32, X, Z);
2797   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2798 
2799   // First quotient/remainder refinement.
2800   auto One = B.buildConstant(S32, 1);
2801   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2802   if (IsDiv)
2803     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2804   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2805 
2806   // Second quotient/remainder refinement.
2807   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2808   if (IsDiv)
2809     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2810   else
2811     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2812 }
2813 
2814 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2815                                               MachineRegisterInfo &MRI,
2816                                               MachineIRBuilder &B) const {
2817   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2818   Register DstReg = MI.getOperand(0).getReg();
2819   Register Num = MI.getOperand(1).getReg();
2820   Register Den = MI.getOperand(2).getReg();
2821   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2822   MI.eraseFromParent();
2823   return true;
2824 }
2825 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2827 //
2828 // Return lo, hi of result
2829 //
2830 // %cvt.lo = G_UITOFP Val.lo
2831 // %cvt.hi = G_UITOFP Val.hi
2832 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2833 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2834 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2835 // %mul2 = G_FMUL %mul1, 2**(-32)
2836 // %trunc = G_INTRINSIC_TRUNC %mul2
2837 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2838 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2839 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2840                                                        Register Val) {
2841   const LLT S32 = LLT::scalar(32);
2842   auto Unmerge = B.buildUnmerge(S32, Val);
2843 
2844   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2845   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2846 
2847   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2848                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2849 
2850   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2851   auto Mul1 =
2852       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2853 
2854   // 2**(-32)
2855   auto Mul2 =
2856       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2857   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2858 
2859   // -(2**32)
2860   auto Mad2 = B.buildFMAD(S32, Trunc,
2861                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2862 
2863   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2864   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2865 
2866   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2867 }
2868 
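// The 64-bit expansion below mirrors the 32-bit path: take the 2**64-scaled
// reciprocal estimate from emitReciprocalU64, refine it with two rounds of
// mul/umulh plus carry-propagating adds, form a quotient estimate with a
// 64-bit umulh, and then apply up to two conditional corrections to the
// quotient and remainder (currently emitted as straight-line selects rather
// than real control flow; see the TODO below).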
2869 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2870                                                   Register DstReg,
2871                                                   Register Numer,
2872                                                   Register Denom,
2873                                                   bool IsDiv) const {
2874   const LLT S32 = LLT::scalar(32);
2875   const LLT S64 = LLT::scalar(64);
2876   const LLT S1 = LLT::scalar(1);
2877   Register RcpLo, RcpHi;
2878 
2879   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2880 
2881   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2882 
2883   auto Zero64 = B.buildConstant(S64, 0);
2884   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2885 
2886   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2887   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2888 
2889   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2890   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2891   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2892 
2893   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2894   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2895   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2896   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2897 
2898   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2899   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2900   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2901   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2902   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2903 
2904   auto Zero32 = B.buildConstant(S32, 0);
2905   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2906   auto Add2_HiC =
2907       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2908   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2909   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2910 
2911   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2912   Register NumerLo = UnmergeNumer.getReg(0);
2913   Register NumerHi = UnmergeNumer.getReg(1);
2914 
2915   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2916   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2917   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2918   Register Mul3_Lo = UnmergeMul3.getReg(0);
2919   Register Mul3_Hi = UnmergeMul3.getReg(1);
2920   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2921   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2922   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2923   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2924 
2925   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2926   Register DenomLo = UnmergeDenom.getReg(0);
2927   Register DenomHi = UnmergeDenom.getReg(1);
2928 
2929   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2930   auto C1 = B.buildSExt(S32, CmpHi);
2931 
2932   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2933   auto C2 = B.buildSExt(S32, CmpLo);
2934 
2935   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2936   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2937 
2938   // TODO: Here and below, portions of the code could be enclosed in if/endif.
2939   // Currently control flow is unconditional and we have 4 selects after the
2940   // potential endif to substitute PHIs.
2941 
2942   // if C3 != 0 ...
2943   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2944   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2945   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2946   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2947 
2948   auto One64 = B.buildConstant(S64, 1);
2949   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2950 
2951   auto C4 =
2952       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2953   auto C5 =
2954       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2955   auto C6 = B.buildSelect(
2956       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2957 
2958   // if (C6 != 0)
2959   auto Add4 = B.buildAdd(S64, Add3, One64);
2960   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2961 
2962   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2963   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2964   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2965 
2966   // endif C6
2967   // endif C3
2968 
2969   if (IsDiv) {
2970     auto Sel1 = B.buildSelect(
2971         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2972     B.buildSelect(DstReg,
2973                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2974   } else {
2975     auto Sel2 = B.buildSelect(
2976         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2977     B.buildSelect(DstReg,
2978                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2979   }
2980 }
2981 
2982 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2983                                             MachineRegisterInfo &MRI,
2984                                             MachineIRBuilder &B) const {
2985   const LLT S64 = LLT::scalar(64);
2986   const LLT S32 = LLT::scalar(32);
2987   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2988   Register DstReg = MI.getOperand(0).getReg();
2989   Register Num = MI.getOperand(1).getReg();
2990   Register Den = MI.getOperand(2).getReg();
2991   LLT Ty = MRI.getType(DstReg);
2992 
2993   if (Ty == S32)
2994     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2995   else if (Ty == S64)
2996     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2997   else
2998     return false;
2999 
3000   MI.eraseFromParent();
3001   return true;
3003 }
3004 
3005 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3006                                             MachineRegisterInfo &MRI,
3007                                             MachineIRBuilder &B) const {
3008   const LLT S64 = LLT::scalar(64);
3009   const LLT S32 = LLT::scalar(32);
3010 
3011   Register DstReg = MI.getOperand(0).getReg();
3012   const LLT Ty = MRI.getType(DstReg);
3013   if (Ty != S32 && Ty != S64)
3014     return false;
3015 
3016   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3017 
3018   Register LHS = MI.getOperand(1).getReg();
3019   Register RHS = MI.getOperand(2).getReg();
3020 
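  // Reduce signed div/rem to the unsigned expansion. Roughly:
  //   sign(v) = v >> (bits - 1)            // 0 or -1
  //   |v|     = (v + sign(v)) ^ sign(v)    // branchless absolute value
  //   q = |lhs| udiv |rhs|,  r = |lhs| urem |rhs|
  //   sdiv: (q ^ s) - s  with  s = sign(lhs) ^ sign(rhs)
  //   srem: (r ^ s) - s  with  s = sign(lhs)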
3021   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3022   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3023   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3024 
3025   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3026   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3027 
3028   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3029   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3030 
3031   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3032   if (Ty == S32)
3033     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3034   else
3035     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3036 
3037   Register Sign;
3038   if (IsDiv)
3039     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3040   else
3041     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3042 
3043   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3044   B.buildSub(DstReg, UDivRem, Sign);
3045 
3046   MI.eraseFromParent();
3047   return true;
3048 }
3049 
3050 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3051                                                  MachineRegisterInfo &MRI,
3052                                                  MachineIRBuilder &B) const {
3053   Register Res = MI.getOperand(0).getReg();
3054   Register LHS = MI.getOperand(1).getReg();
3055   Register RHS = MI.getOperand(2).getReg();
3056 
3057   uint16_t Flags = MI.getFlags();
3058 
3059   LLT ResTy = MRI.getType(Res);
3060   LLT S32 = LLT::scalar(32);
3061   LLT S64 = LLT::scalar(64);
3062 
3063   const MachineFunction &MF = B.getMF();
3064   bool Unsafe =
3065     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3066 
3067   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3068     return false;
3069 
3070   if (!Unsafe && ResTy == S32 &&
3071       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3072     return false;
3073 
3074   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3075     // 1 / x -> RCP(x)
3076     if (CLHS->isExactlyValue(1.0)) {
3077       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3078         .addUse(RHS)
3079         .setMIFlags(Flags);
3080 
3081       MI.eraseFromParent();
3082       return true;
3083     }
3084 
3085     // -1 / x -> RCP( FNEG(x) )
3086     if (CLHS->isExactlyValue(-1.0)) {
3087       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3088       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3089         .addUse(FNeg.getReg(0))
3090         .setMIFlags(Flags);
3091 
3092       MI.eraseFromParent();
3093       return true;
3094     }
3095   }
3096 
3097   // x / y -> x * (1.0 / y)
3098   if (Unsafe) {
3099     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3100       .addUse(RHS)
3101       .setMIFlags(Flags);
3102     B.buildFMul(Res, LHS, RCP, Flags);
3103 
3104     MI.eraseFromParent();
3105     return true;
3106   }
3107 
3108   return false;
3109 }
3110 
3111 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3112                                          MachineRegisterInfo &MRI,
3113                                          MachineIRBuilder &B) const {
3114   Register Res = MI.getOperand(0).getReg();
3115   Register LHS = MI.getOperand(1).getReg();
3116   Register RHS = MI.getOperand(2).getReg();
3117 
3118   uint16_t Flags = MI.getFlags();
3119 
3120   LLT S16 = LLT::scalar(16);
3121   LLT S32 = LLT::scalar(32);
3122 
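  // f16 division is expanded via f32: widen both operands, multiply the
  // numerator by rcp of the denominator, truncate back to f16, and let
  // div_fixup patch up the special cases.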
3123   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3124   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3125 
3126   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3127     .addUse(RHSExt.getReg(0))
3128     .setMIFlags(Flags);
3129 
3130   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3131   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3132 
3133   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3134     .addUse(RDst.getReg(0))
3135     .addUse(RHS)
3136     .addUse(LHS)
3137     .setMIFlags(Flags);
3138 
3139   MI.eraseFromParent();
3140   return true;
3141 }
3142 
3143 // Enable or disable FP32 denorm mode: when 'Enable' is true, emit instructions
3144 // to enable denorm mode; when false, restore the function's default FP32 denorm mode.
3145 static void toggleSPDenormMode(bool Enable,
3146                                MachineIRBuilder &B,
3147                                const GCNSubtarget &ST,
3148                                AMDGPU::SIModeRegisterDefaults Mode) {
3149   // Set SP denorm mode to this value.
3150   unsigned SPDenormMode =
3151     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3152 
3153   if (ST.hasDenormModeInst()) {
3154     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3155     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3156 
3157     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3158     B.buildInstr(AMDGPU::S_DENORM_MODE)
3159       .addImm(NewDenormModeValue);
3160 
3161   } else {
3162     // Select FP32 bit field in mode register.
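    // Offset 4, width 2 selects MODE[5:4], the FP32 denorm control bits.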
3163     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3164                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3165                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3166 
3167     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3168       .addImm(SPDenormMode)
3169       .addImm(SPDenormModeBitField);
3170   }
3171 }
3172 
3173 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3174                                          MachineRegisterInfo &MRI,
3175                                          MachineIRBuilder &B) const {
3176   Register Res = MI.getOperand(0).getReg();
3177   Register LHS = MI.getOperand(1).getReg();
3178   Register RHS = MI.getOperand(2).getReg();
3179   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3180   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3181 
3182   uint16_t Flags = MI.getFlags();
3183 
3184   LLT S32 = LLT::scalar(32);
3185   LLT S1 = LLT::scalar(1);
3186 
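  // The sequence below follows the usual f32 division expansion: div_scale the
  // operands, refine rcp of the scaled denominator with fused multiply-adds,
  // then let div_fmas/div_fixup undo the scaling and handle special values.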
3187   auto One = B.buildFConstant(S32, 1.0f);
3188 
3189   auto DenominatorScaled =
3190     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3191       .addUse(LHS)
3192       .addUse(RHS)
3193       .addImm(0)
3194       .setMIFlags(Flags);
3195   auto NumeratorScaled =
3196     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3197       .addUse(LHS)
3198       .addUse(RHS)
3199       .addImm(1)
3200       .setMIFlags(Flags);
3201 
3202   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3203     .addUse(DenominatorScaled.getReg(0))
3204     .setMIFlags(Flags);
3205   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3206 
3207   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3208   // aren't modeled as reading it.
3209   if (!Mode.allFP32Denormals())
3210     toggleSPDenormMode(true, B, ST, Mode);
3211 
3212   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3213   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3214   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3215   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3216   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3217   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3218 
3219   if (!Mode.allFP32Denormals())
3220     toggleSPDenormMode(false, B, ST, Mode);
3221 
3222   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3223     .addUse(Fma4.getReg(0))
3224     .addUse(Fma1.getReg(0))
3225     .addUse(Fma3.getReg(0))
3226     .addUse(NumeratorScaled.getReg(1))
3227     .setMIFlags(Flags);
3228 
3229   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3230     .addUse(Fmas.getReg(0))
3231     .addUse(RHS)
3232     .addUse(LHS)
3233     .setMIFlags(Flags);
3234 
3235   MI.eraseFromParent();
3236   return true;
3237 }
3238 
3239 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3240                                          MachineRegisterInfo &MRI,
3241                                          MachineIRBuilder &B) const {
3242   Register Res = MI.getOperand(0).getReg();
3243   Register LHS = MI.getOperand(1).getReg();
3244   Register RHS = MI.getOperand(2).getReg();
3245 
3246   uint16_t Flags = MI.getFlags();
3247 
3248   LLT S64 = LLT::scalar(64);
3249   LLT S1 = LLT::scalar(1);
3250 
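  // Same shape as the f32 expansion: div_scale, an FMA-based Newton-Raphson
  // refinement of rcp, then div_fmas/div_fixup. The extra unmerge/compare code
  // further down works around targets where div_scale's condition output is
  // unusable.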
3251   auto One = B.buildFConstant(S64, 1.0);
3252 
3253   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3254     .addUse(LHS)
3255     .addUse(RHS)
3256     .addImm(0)
3257     .setMIFlags(Flags);
3258 
3259   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3260 
3261   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3262     .addUse(DivScale0.getReg(0))
3263     .setMIFlags(Flags);
3264 
3265   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3266   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3267   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3268 
3269   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3270     .addUse(LHS)
3271     .addUse(RHS)
3272     .addImm(1)
3273     .setMIFlags(Flags);
3274 
3275   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3276   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3277   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3278 
3279   Register Scale;
3280   if (!ST.hasUsableDivScaleConditionOutput()) {
3281     // Workaround a hardware bug on SI where the condition output from div_scale
3282     // is not usable.
3283 
3284     LLT S32 = LLT::scalar(32);
3285 
3286     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3287     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3288     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3289     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3290 
3291     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3292                               Scale1Unmerge.getReg(1));
3293     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3294                               Scale0Unmerge.getReg(1));
3295     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3296   } else {
3297     Scale = DivScale1.getReg(1);
3298   }
3299 
3300   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3301     .addUse(Fma4.getReg(0))
3302     .addUse(Fma3.getReg(0))
3303     .addUse(Mul.getReg(0))
3304     .addUse(Scale)
3305     .setMIFlags(Flags);
3306 
3307   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3308     .addUse(Fmas.getReg(0))
3309     .addUse(RHS)
3310     .addUse(LHS)
3311     .setMIFlags(Flags);
3312 
3313   MI.eraseFromParent();
3314   return true;
3315 }
3316 
3317 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3318                                                  MachineRegisterInfo &MRI,
3319                                                  MachineIRBuilder &B) const {
3320   Register Res = MI.getOperand(0).getReg();
3321   Register LHS = MI.getOperand(2).getReg();
3322   Register RHS = MI.getOperand(3).getReg();
3323   uint16_t Flags = MI.getFlags();
3324 
3325   LLT S32 = LLT::scalar(32);
3326   LLT S1 = LLT::scalar(1);
3327 
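  // Range reduction for the fast f32 division: if |den| is larger than
  // 0x6f800000 (2**96), pre-scale the denominator by 2**(-32) before taking
  // rcp (presumably to keep the reciprocal out of the denormal range), and
  // apply the same scale to the final product to compensate.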
3328   auto Abs = B.buildFAbs(S32, RHS, Flags);
3329   const APFloat C0Val(1.0f);
3330 
3331   auto C0 = B.buildConstant(S32, 0x6f800000);
3332   auto C1 = B.buildConstant(S32, 0x2f800000);
3333   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3334 
3335   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3336   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3337 
3338   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3339 
3340   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3341     .addUse(Mul0.getReg(0))
3342     .setMIFlags(Flags);
3343 
3344   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3345 
3346   B.buildFMul(Res, Sel, Mul1, Flags);
3347 
3348   MI.eraseFromParent();
3349   return true;
3350 }
3351 
3352 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3353 // FIXME: Why do we handle this one but not other removed instructions?
3354 //
3355 // Reciprocal square root.  The clamp prevents infinite results, clamping
3356 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3357 // +-max_float.
3358 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3359                                                     MachineRegisterInfo &MRI,
3360                                                     MachineIRBuilder &B) const {
3361   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3362     return true;
3363 
3364   Register Dst = MI.getOperand(0).getReg();
3365   Register Src = MI.getOperand(2).getReg();
3366   auto Flags = MI.getFlags();
3367 
3368   LLT Ty = MRI.getType(Dst);
3369 
3370   const fltSemantics *FltSemantics;
3371   if (Ty == LLT::scalar(32))
3372     FltSemantics = &APFloat::IEEEsingle();
3373   else if (Ty == LLT::scalar(64))
3374     FltSemantics = &APFloat::IEEEdouble();
3375   else
3376     return false;
3377 
3378   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3379     .addUse(Src)
3380     .setMIFlags(Flags);
3381 
3382   // We don't need to concern ourselves with the snan handling difference, since
3383   // the rsq already quieted (or not), so use the variant which will directly select.
3384   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3385   const bool UseIEEE = MFI->getMode().IEEE;
3386 
3387   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3388   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3389                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3390 
3391   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3392 
3393   if (UseIEEE)
3394     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3395   else
3396     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3397   MI.eraseFromParent();
3398   return true;
3399 }
3400 
3401 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3402   switch (IID) {
3403   case Intrinsic::amdgcn_ds_fadd:
3404     return AMDGPU::G_ATOMICRMW_FADD;
3405   case Intrinsic::amdgcn_ds_fmin:
3406     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3407   case Intrinsic::amdgcn_ds_fmax:
3408     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3409   default:
3410     llvm_unreachable("not a DS FP intrinsic");
3411   }
3412 }
3413 
3414 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3415                                                       MachineInstr &MI,
3416                                                       Intrinsic::ID IID) const {
3417   GISelChangeObserver &Observer = Helper.Observer;
3418   Observer.changingInstr(MI);
3419 
3420   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3421 
3422   // The remaining operands were used to set fields in the MemOperand on
3423   // construction.
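  // For the ds fadd/fmin/fmax intrinsics these trailing operands should be the
  // ordering, scope and isVolatile immediates.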
3424   for (int I = 6; I > 3; --I)
3425     MI.RemoveOperand(I);
3426 
3427   MI.RemoveOperand(1); // Remove the intrinsic ID.
3428   Observer.changedInstr(MI);
3429   return true;
3430 }
3431 
3432 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3433                                             MachineRegisterInfo &MRI,
3434                                             MachineIRBuilder &B) const {
3435   uint64_t Offset =
3436     ST.getTargetLowering()->getImplicitParameterOffset(
3437       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3438   LLT DstTy = MRI.getType(DstReg);
3439   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3440 
3441   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3442   if (!loadInputValue(KernargPtrReg, B,
3443                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3444     return false;
3445 
3446   // FIXME: This should be nuw
3447   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3448   return true;
3449 }
3450 
3451 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3452                                                  MachineRegisterInfo &MRI,
3453                                                  MachineIRBuilder &B) const {
3454   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3455   if (!MFI->isEntryFunction()) {
3456     return legalizePreloadedArgIntrin(MI, MRI, B,
3457                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3458   }
3459 
3460   Register DstReg = MI.getOperand(0).getReg();
3461   if (!getImplicitArgPtr(DstReg, MRI, B))
3462     return false;
3463 
3464   MI.eraseFromParent();
3465   return true;
3466 }
3467 
3468 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3469                                               MachineRegisterInfo &MRI,
3470                                               MachineIRBuilder &B,
3471                                               unsigned AddrSpace) const {
3472   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3473   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3474   Register Hi32 = Unmerge.getReg(1);
3475 
3476   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3477   MI.eraseFromParent();
3478   return true;
3479 }
3480 
3481 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3482 // offset (the offset that is included in bounds checking and swizzling, to be
3483 // split between the instruction's voffset and immoffset fields) and soffset
3484 // (the offset that is excluded from bounds checking and swizzling, to go in
3485 // the instruction's soffset field).  This function takes the first kind of
3486 // offset and figures out how to split it between voffset and immoffset.
3487 std::tuple<Register, unsigned, unsigned>
3488 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3489                                         Register OrigOffset) const {
3490   const unsigned MaxImm = 4095;
3491   Register BaseReg;
3492   unsigned TotalConstOffset;
3493   const LLT S32 = LLT::scalar(32);
3494 
3495   std::tie(BaseReg, TotalConstOffset) =
3496       AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3497 
3498   unsigned ImmOffset = TotalConstOffset;
3499 
3500   // If the immediate value is too big for the immoffset field, put the value
3501   // and -4096 into the immoffset field so that the value that is copied/added
3502   // for the voffset field is a multiple of 4096, and it stands more chance
3503   // of being CSEd with the copy/add for another similar load/store.
3504   // However, do not do that rounding down to a multiple of 4096 if that is a
3505   // negative number, as it appears to be illegal to have a negative offset
3506   // in the vgpr, even if adding the immediate offset makes it positive.
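  // For example (an illustrative sketch): a constant offset of 5000 is split
  // into 4096 added to the voffset register and an immoffset of 904, while a
  // negative constant offset such as -16 is left entirely in the voffset.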
3507   unsigned Overflow = ImmOffset & ~MaxImm;
3508   ImmOffset -= Overflow;
3509   if ((int32_t)Overflow < 0) {
3510     Overflow += ImmOffset;
3511     ImmOffset = 0;
3512   }
3513 
3514   if (Overflow != 0) {
3515     if (!BaseReg) {
3516       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3517     } else {
3518       auto OverflowVal = B.buildConstant(S32, Overflow);
3519       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3520     }
3521   }
3522 
3523   if (!BaseReg)
3524     BaseReg = B.buildConstant(S32, 0).getReg(0);
3525 
3526   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3527 }
3528 
3529 /// Handle the register layout difference for f16 images on some subtargets.
3530 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3531                                              MachineRegisterInfo &MRI,
3532                                              Register Reg,
3533                                              bool ImageStore) const {
3534   const LLT S16 = LLT::scalar(16);
3535   const LLT S32 = LLT::scalar(32);
3536   LLT StoreVT = MRI.getType(Reg);
3537   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3538 
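  // On subtargets with unpacked D16 each 16-bit element gets its own dword
  // (e.g. <4 x s16> data becomes <4 x s32> with anyext'd elements); on packed
  // subtargets with the image store bug, the data is instead padded out to a
  // whole number of dwords below.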
3539   if (ST.hasUnpackedD16VMem()) {
3540     auto Unmerge = B.buildUnmerge(S16, Reg);
3541 
3542     SmallVector<Register, 4> WideRegs;
3543     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3544       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3545 
3546     int NumElts = StoreVT.getNumElements();
3547 
3548     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3549   }
3550 
3551   if (ImageStore && ST.hasImageStoreD16Bug()) {
3552     if (StoreVT.getNumElements() == 2) {
3553       SmallVector<Register, 4> PackedRegs;
3554       Reg = B.buildBitcast(S32, Reg).getReg(0);
3555       PackedRegs.push_back(Reg);
3556       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3557       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3558     }
3559 
3560     if (StoreVT.getNumElements() == 3) {
3561       SmallVector<Register, 4> PackedRegs;
3562       auto Unmerge = B.buildUnmerge(S16, Reg);
3563       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3564         PackedRegs.push_back(Unmerge.getReg(I));
3565       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3566       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3567       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3568     }
3569 
3570     if (StoreVT.getNumElements() == 4) {
3571       SmallVector<Register, 4> PackedRegs;
3572       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3573       auto Unmerge = B.buildUnmerge(S32, Reg);
3574       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3575         PackedRegs.push_back(Unmerge.getReg(I));
3576       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3577       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3578     }
3579 
3580     llvm_unreachable("invalid data type");
3581   }
3582 
3583   return Reg;
3584 }
3585 
3586 Register AMDGPULegalizerInfo::fixStoreSourceType(
3587   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3588   MachineRegisterInfo *MRI = B.getMRI();
3589   LLT Ty = MRI->getType(VData);
3590 
3591   const LLT S16 = LLT::scalar(16);
3592 
3593   // Fixup illegal register types for i8 and i16 stores.
3594   if (Ty == LLT::scalar(8) || Ty == S16) {
3595     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3596     return AnyExt;
3597   }
3598 
3599   if (Ty.isVector()) {
3600     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3601       if (IsFormat)
3602         return handleD16VData(B, *MRI, VData);
3603     }
3604   }
3605 
3606   return VData;
3607 }
3608 
3609 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3610                                               MachineRegisterInfo &MRI,
3611                                               MachineIRBuilder &B,
3612                                               bool IsTyped,
3613                                               bool IsFormat) const {
3614   Register VData = MI.getOperand(1).getReg();
3615   LLT Ty = MRI.getType(VData);
3616   LLT EltTy = Ty.getScalarType();
3617   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3618   const LLT S32 = LLT::scalar(32);
3619 
3620   VData = fixStoreSourceType(B, VData, IsFormat);
3621   Register RSrc = MI.getOperand(2).getReg();
3622 
3623   MachineMemOperand *MMO = *MI.memoperands_begin();
3624   const int MemSize = MMO->getSize();
3625 
3626   unsigned ImmOffset;
3627   unsigned TotalOffset;
3628 
3629   // The typed intrinsics add an immediate after the registers.
3630   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3631 
3632   // The struct intrinsic variants add one additional operand over raw.
3633   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3634   Register VIndex;
3635   int OpOffset = 0;
3636   if (HasVIndex) {
3637     VIndex = MI.getOperand(3).getReg();
3638     OpOffset = 1;
3639   }
3640 
3641   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3642   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3643 
3644   unsigned Format = 0;
3645   if (IsTyped) {
3646     Format = MI.getOperand(5 + OpOffset).getImm();
3647     ++OpOffset;
3648   }
3649 
3650   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3651 
3652   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3653   if (TotalOffset != 0)
3654     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3655 
3656   unsigned Opc;
3657   if (IsTyped) {
3658     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3659                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3660   } else if (IsFormat) {
3661     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3662                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3663   } else {
3664     switch (MemSize) {
3665     case 1:
3666       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3667       break;
3668     case 2:
3669       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3670       break;
3671     default:
3672       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3673       break;
3674     }
3675   }
3676 
3677   if (!VIndex)
3678     VIndex = B.buildConstant(S32, 0).getReg(0);
3679 
3680   auto MIB = B.buildInstr(Opc)
3681     .addUse(VData)              // vdata
3682     .addUse(RSrc)               // rsrc
3683     .addUse(VIndex)             // vindex
3684     .addUse(VOffset)            // voffset
3685     .addUse(SOffset)            // soffset
3686     .addImm(ImmOffset);         // offset(imm)
3687 
3688   if (IsTyped)
3689     MIB.addImm(Format);
3690 
3691   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3692      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3693      .addMemOperand(MMO);
3694 
3695   MI.eraseFromParent();
3696   return true;
3697 }
3698 
3699 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3700                                              MachineRegisterInfo &MRI,
3701                                              MachineIRBuilder &B,
3702                                              bool IsFormat,
3703                                              bool IsTyped) const {
3704   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3705   MachineMemOperand *MMO = *MI.memoperands_begin();
3706   const int MemSize = MMO->getSize();
3707   const LLT S32 = LLT::scalar(32);
3708 
3709   Register Dst = MI.getOperand(0).getReg();
3710   Register RSrc = MI.getOperand(2).getReg();
3711 
3712   // The typed intrinsics add an immediate after the registers.
3713   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3714 
3715   // The struct intrinsic variants add one additional operand over raw.
3716   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3717   Register VIndex;
3718   int OpOffset = 0;
3719   if (HasVIndex) {
3720     VIndex = MI.getOperand(3).getReg();
3721     OpOffset = 1;
3722   }
3723 
3724   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3725   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3726 
3727   unsigned Format = 0;
3728   if (IsTyped) {
3729     Format = MI.getOperand(5 + OpOffset).getImm();
3730     ++OpOffset;
3731   }
3732 
3733   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3734   unsigned ImmOffset;
3735   unsigned TotalOffset;
3736 
3737   LLT Ty = MRI.getType(Dst);
3738   LLT EltTy = Ty.getScalarType();
3739   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3740   const bool Unpacked = ST.hasUnpackedD16VMem();
3741 
3742   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3743   if (TotalOffset != 0)
3744     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3745 
3746   unsigned Opc;
3747 
3748   if (IsTyped) {
3749     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3750                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3751   } else if (IsFormat) {
3752     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3753                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3754   } else {
3755     switch (MemSize) {
3756     case 1:
3757       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3758       break;
3759     case 2:
3760       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3761       break;
3762     default:
3763       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3764       break;
3765     }
3766   }
3767 
3768   Register LoadDstReg;
3769 
3770   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3771   LLT UnpackedTy = Ty.changeElementSize(32);
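  // Three result cases: sub-dword scalar loads are widened to s32 and
  // truncated afterwards; D16 vector results on unpacked subtargets are loaded
  // as one s32 per element and repacked; everything else loads directly into
  // the final register.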
3772 
3773   if (IsExtLoad)
3774     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3775   else if (Unpacked && IsD16 && Ty.isVector())
3776     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3777   else
3778     LoadDstReg = Dst;
3779 
3780   if (!VIndex)
3781     VIndex = B.buildConstant(S32, 0).getReg(0);
3782 
3783   auto MIB = B.buildInstr(Opc)
3784     .addDef(LoadDstReg)         // vdata
3785     .addUse(RSrc)               // rsrc
3786     .addUse(VIndex)             // vindex
3787     .addUse(VOffset)            // voffset
3788     .addUse(SOffset)            // soffset
3789     .addImm(ImmOffset);         // offset(imm)
3790 
3791   if (IsTyped)
3792     MIB.addImm(Format);
3793 
3794   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3795      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3796      .addMemOperand(MMO);
3797 
3798   if (LoadDstReg != Dst) {
3799     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3800 
3801     // The result was widened for extending loads; truncate back down.
3802     if (IsExtLoad)
3803       B.buildTrunc(Dst, LoadDstReg);
3804     else {
3805       // Repack to original 16-bit vector result
3806       // FIXME: G_TRUNC should work, but legalization currently fails
3807       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3808       SmallVector<Register, 4> Repack;
3809       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3810         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3811       B.buildMerge(Dst, Repack);
3812     }
3813   }
3814 
3815   MI.eraseFromParent();
3816   return true;
3817 }
3818 
3819 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3820                                                MachineIRBuilder &B,
3821                                                bool IsInc) const {
3822   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3823                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3824   B.buildInstr(Opc)
3825     .addDef(MI.getOperand(0).getReg())
3826     .addUse(MI.getOperand(2).getReg())
3827     .addUse(MI.getOperand(3).getReg())
3828     .cloneMemRefs(MI);
3829   MI.eraseFromParent();
3830   return true;
3831 }
3832 
3833 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3834   switch (IntrID) {
3835   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3836   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3837     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3838   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3839   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3840     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3841   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3842   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3843     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3844   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3845   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3846     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3847   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3848   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3849     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3850   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3851   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3852     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3853   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3854   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3855     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3856   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3857   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3858     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3859   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3860   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3861     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3862   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3863   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3864     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3865   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3866   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3867     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3868   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3869   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3870     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3871   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3872   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3873     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3874   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3875   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3876     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3877   default:
3878     llvm_unreachable("unhandled atomic opcode");
3879   }
3880 }
3881 
3882 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3883                                                MachineIRBuilder &B,
3884                                                Intrinsic::ID IID) const {
3885   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3886                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3887   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3888 
3889   Register Dst;
3890 
3891   int OpOffset = 0;
3892   if (HasReturn) {
3893     // A few FP atomics do not support return values.
3894     Dst = MI.getOperand(0).getReg();
3895   } else {
3896     OpOffset = -1;
3897   }
3898 
3899   Register VData = MI.getOperand(2 + OpOffset).getReg();
3900   Register CmpVal;
3901 
3902   if (IsCmpSwap) {
3903     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3904     ++OpOffset;
3905   }
3906 
3907   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3908   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3909 
3910   // The struct intrinsic variants add one additional operand over raw.
3911   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3912   Register VIndex;
3913   if (HasVIndex) {
3914     VIndex = MI.getOperand(4 + OpOffset).getReg();
3915     ++OpOffset;
3916   }
3917 
3918   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3919   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3920   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3921 
3922   MachineMemOperand *MMO = *MI.memoperands_begin();
3923 
3924   unsigned ImmOffset;
3925   unsigned TotalOffset;
3926   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3927   if (TotalOffset != 0)
3928     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3929 
3930   if (!VIndex)
3931     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3932 
3933   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3934 
3935   if (HasReturn)
3936     MIB.addDef(Dst);
3937 
3938   MIB.addUse(VData); // vdata
3939 
3940   if (IsCmpSwap)
3941     MIB.addReg(CmpVal);
3942 
3943   MIB.addUse(RSrc)               // rsrc
3944      .addUse(VIndex)             // vindex
3945      .addUse(VOffset)            // voffset
3946      .addUse(SOffset)            // soffset
3947      .addImm(ImmOffset)          // offset(imm)
3948      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3949      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3950      .addMemOperand(MMO);
3951 
3952   MI.eraseFromParent();
3953   return true;
3954 }
3955 
3956 /// Turn the s16 typed address operands of the image intrinsic \p MI into dword
3957 /// sized <2 x s16> vectors, collected in \p PackedAddrs.
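/// For example, a 2D coordinate pair (s16 x, s16 y) becomes one <2 x s16>
/// register, and an unpaired trailing gradient or coordinate is packed with
/// undef in its high half.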
3958 static void packImageA16AddressToDwords(
3959     MachineIRBuilder &B, MachineInstr &MI,
3960     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3961     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3962   const LLT S16 = LLT::scalar(16);
3963   const LLT V2S16 = LLT::vector(2, 16);
3964 
3965   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3966     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3967     if (!SrcOp.isReg())
3968       continue; // _L to _LZ may have eliminated this.
3969 
3970     Register AddrReg = SrcOp.getReg();
3971 
3972     if (I < Intr->GradientStart) {
3973       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3974       PackedAddrs.push_back(AddrReg);
3975     } else {
3976       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3977       // derivatives dx/dh and dx/dv are packed with undef.
3978       if (((I + 1) >= EndIdx) ||
3979           ((Intr->NumGradients / 2) % 2 == 1 &&
3980            (I == static_cast<unsigned>(Intr->GradientStart +
3981                                        (Intr->NumGradients / 2) - 1) ||
3982             I == static_cast<unsigned>(Intr->GradientStart +
3983                                        Intr->NumGradients - 1))) ||
3984           // Check for _L to _LZ optimization
3985           !MI.getOperand(ArgOffset + I + 1).isReg()) {
3986         PackedAddrs.push_back(
3987             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3988                 .getReg(0));
3989       } else {
3990         PackedAddrs.push_back(
3991             B.buildBuildVector(
3992                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
3993                 .getReg(0));
3994         ++I;
3995       }
3996     }
3997   }
3998 }
3999 
4000 /// Convert from separate vaddr components to a single vector address register,
4001 /// and replace the remaining operands with $noreg.
4002 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4003                                      int DimIdx, int NumVAddrs) {
4004   const LLT S32 = LLT::scalar(32);
4005 
4006   SmallVector<Register, 8> AddrRegs;
4007   for (int I = 0; I != NumVAddrs; ++I) {
4008     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4009     if (SrcOp.isReg()) {
4010       AddrRegs.push_back(SrcOp.getReg());
4011       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4012     }
4013   }
4014 
4015   int NumAddrRegs = AddrRegs.size();
4016   if (NumAddrRegs != 1) {
4017     // Round up to 8 elements for v5-v7
4018     // FIXME: Missing intermediate sized register classes and instructions.
4019     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4020       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4021       auto Undef = B.buildUndef(S32);
4022       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4023       NumAddrRegs = RoundedNumRegs;
4024     }
4025 
4026     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4027     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4028   }
4029 
4030   for (int I = 1; I != NumVAddrs; ++I) {
4031     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4032     if (SrcOp.isReg())
4033       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4034   }
4035 }
4036 
4037 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4038 ///
4039 /// Depending on the subtarget, load/store with 16-bit element data need to be
4040 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4041 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4042 /// registers.
4043 ///
4044 /// We don't want to directly select image instructions just yet, but also want
4045 /// to expose all register repacking to the legalizer/combiners. We also don't
4046 /// want a selected instruction entering RegBankSelect. In order to avoid
4047 /// defining a multitude of intermediate image instructions, directly hack on
4048 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4049 /// the now-unnecessary arguments with $noreg.
4050 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4051     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4052     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4053 
4054   const unsigned NumDefs = MI.getNumExplicitDefs();
4055   const unsigned ArgOffset = NumDefs + 1;
4056   bool IsTFE = NumDefs == 2;
4057   // We are only processing the operands of d16 image operations on subtargets
4058   // that use the unpacked register layout, or need to repack the TFE result.
4059 
4060   // TODO: Do we need to guard against already legalized intrinsics?
4061   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4062       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4063 
4064   MachineRegisterInfo *MRI = B.getMRI();
4065   const LLT S32 = LLT::scalar(32);
4066   const LLT S16 = LLT::scalar(16);
4067   const LLT V2S16 = LLT::vector(2, 16);
4068 
4069   unsigned DMask = 0;
4070 
4071   // Check for 16 bit addresses and pack if true.
4072   LLT GradTy =
4073       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4074   LLT AddrTy =
4075       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4076   const bool IsG16 = GradTy == S16;
4077   const bool IsA16 = AddrTy == S16;
4078 
4079   int DMaskLanes = 0;
4080   if (!BaseOpcode->Atomic) {
4081     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4082     if (BaseOpcode->Gather4) {
4083       DMaskLanes = 4;
4084     } else if (DMask != 0) {
4085       DMaskLanes = countPopulation(DMask);
4086     } else if (!IsTFE && !BaseOpcode->Store) {
4087       // If dmask is 0, this is a no-op load. This can be eliminated.
4088       B.buildUndef(MI.getOperand(0));
4089       MI.eraseFromParent();
4090       return true;
4091     }
4092   }
4093 
4094   Observer.changingInstr(MI);
4095   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4096 
4097   unsigned NewOpcode = NumDefs == 0 ?
4098     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4099 
4100   // Track that we legalized this
4101   MI.setDesc(B.getTII().get(NewOpcode));
4102 
4103   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
4104   // to be at least 1, otherwise the instruction will fail.
4105   if (IsTFE && DMask == 0) {
4106     DMask = 0x1;
4107     DMaskLanes = 1;
4108     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4109   }
4110 
4111   if (BaseOpcode->Atomic) {
4112     Register VData0 = MI.getOperand(2).getReg();
4113     LLT Ty = MRI->getType(VData0);
4114 
4115     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4116     if (Ty.isVector())
4117       return false;
4118 
4119     if (BaseOpcode->AtomicX2) {
4120       Register VData1 = MI.getOperand(3).getReg();
4121       // The two values are packed in one register.
4122       LLT PackedTy = LLT::vector(2, Ty);
4123       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4124       MI.getOperand(2).setReg(Concat.getReg(0));
4125       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4126     }
4127   }
4128 
4129   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4130 
4131   // Optimize _L to _LZ when _L is zero
4132   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4133           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4134     const ConstantFP *ConstantLod;
4135 
4136     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4137                  m_GFCst(ConstantLod))) {
4138       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4139         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4140         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4141             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4142                                                       Intr->Dim);
4143 
4144         // The starting indexes should remain in the same place.
4145         --CorrectedNumVAddrs;
4146 
4147         MI.getOperand(MI.getNumExplicitDefs())
4148             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4149         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4150         Intr = NewImageDimIntr;
4151       }
4152     }
4153   }
4154 
4155   // Optimize _mip away, when 'lod' is zero
4156   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4157     int64_t ConstantLod;
4158     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4159                  m_ICst(ConstantLod))) {
4160       if (ConstantLod == 0) {
4161         // TODO: Change the intrinsic opcode and remove the operand instead of
4162         // replacing it with 0, as is done for the _L to _LZ handling above.
4163         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4164         --CorrectedNumVAddrs;
4165       }
4166     }
4167   }
4168 
4169   // Rewrite the addressing register layout before doing anything else.
4170   if (IsA16 || IsG16) {
4171     if (IsA16) {
4172       // Target must support the feature and gradients need to be 16 bit too
4173       if (!ST.hasA16() || !IsG16)
4174         return false;
4175     } else if (!ST.hasG16())
4176       return false;
4177 
4178     if (Intr->NumVAddrs > 1) {
4179       SmallVector<Register, 4> PackedRegs;
4180       // Don't compress addresses for G16
4181       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4182       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4183                                   PackEndIdx);
4184 
4185       if (!IsA16) {
4186         // Add uncompressed address
4187         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4188           int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4189           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4190           PackedRegs.push_back(AddrReg);
4191         }
4192       }
4193 
4194       // See also below in the non-a16 branch
4195       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4196 
4197       if (!UseNSA && PackedRegs.size() > 1) {
4198         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4199         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4200         PackedRegs[0] = Concat.getReg(0);
4201         PackedRegs.resize(1);
4202       }
4203 
4204       const unsigned NumPacked = PackedRegs.size();
4205       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4206         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4207         if (!SrcOp.isReg()) {
4208           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4209           continue;
4210         }
4211 
4212         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4213 
4214         if (I - Intr->VAddrStart < NumPacked)
4215           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4216         else
4217           SrcOp.setReg(AMDGPU::NoRegister);
4218       }
4219     }
4220   } else {
4221     // If the register allocator cannot place the address registers contiguously
4222     // without introducing moves, then using the non-sequential address encoding
4223     // is always preferable, since it saves VALU instructions and is usually a
4224     // wash in terms of code size or even better.
4225     //
4226     // However, we currently have no way of hinting to the register allocator
4227     // that MIMG addresses should be placed contiguously when it is possible to
4228     // do so, so force non-NSA for the common 2-address case as a heuristic.
4229     //
4230     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4231     // allocation when possible.
4232     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4233 
4234     if (!UseNSA && Intr->NumVAddrs > 1)
4235       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4236                                Intr->NumVAddrs);
4237   }
4238 
4239   int Flags = 0;
4240   if (IsA16)
4241     Flags |= 1;
4242   if (IsG16)
4243     Flags |= 2;
4244   MI.addOperand(MachineOperand::CreateImm(Flags));
4245 
4246   if (BaseOpcode->Store) { // No TFE for stores?
4247     // TODO: Handle dmask trim
4248     Register VData = MI.getOperand(1).getReg();
4249     LLT Ty = MRI->getType(VData);
4250     if (!Ty.isVector() || Ty.getElementType() != S16)
4251       return true;
4252 
4253     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4254     if (RepackedReg != VData) {
4255       MI.getOperand(1).setReg(RepackedReg);
4256     }
4257 
4258     return true;
4259   }
4260 
4261   Register DstReg = MI.getOperand(0).getReg();
4262   LLT Ty = MRI->getType(DstReg);
4263   const LLT EltTy = Ty.getScalarType();
4264   const bool IsD16 = Ty.getScalarType() == S16;
4265   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4266 
4267   // Confirm that the return type is large enough for the dmask specified
4268   if (NumElts < DMaskLanes)
4269     return false;
4270 
4271   if (NumElts > 4 || DMaskLanes > 4)
4272     return false;
4273 
4274   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4275   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4276 
4277   // The raw dword aligned data component of the load. The only legal cases
4278   // where this matters should be when using the packed D16 format, for
4279   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4280   LLT RoundedTy;
4281 
4282   // S32 vector to cover all data, plus the TFE result element.
4283   LLT TFETy;
4284 
4285   // Register type to use for each loaded component. Will be S32 or V2S16.
4286   LLT RegTy;
4287 
4288   if (IsD16 && ST.hasUnpackedD16VMem()) {
4289     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4290     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4291     RegTy = S32;
4292   } else {
4293     unsigned EltSize = EltTy.getSizeInBits();
4294     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4295     unsigned RoundedSize = 32 * RoundedElts;
4296     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4297     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4298     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4299   }
4300 
4301   // The return type does not need adjustment.
4302   // TODO: Should we change s16 case to s32 or <2 x s16>?
4303   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4304     return true;
4305 
4306   Register Dst1Reg;
4307 
4308   // Insert after the instruction.
4309   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4310 
4311   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4312   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4313   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4314   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4315 
4316   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4317 
4318   MI.getOperand(0).setReg(NewResultReg);
4319 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
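  //
  // For example (illustrative), an s32 TFE load that returned {s32, s32} in
  // the IR is rewritten to define a single <2 x s32> register, which is then
  // unmerged into the data dword and the TFE status dword.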
4324 
4325   if (IsTFE) {
4326     Dst1Reg = MI.getOperand(1).getReg();
4327     if (MRI->getType(Dst1Reg) != S32)
4328       return false;
4329 
4330     // TODO: Make sure the TFE operand bit is set.
4331     MI.RemoveOperand(1);
4332 
4333     // Handle the easy case that requires no repack instructions.
4334     if (Ty == S32) {
4335       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4336       return true;
4337     }
4338   }
4339 
4340   // Now figure out how to copy the new result register back into the old
4341   // result.
4342   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4343 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4345 
4346   if (ResultNumRegs == 1) {
4347     assert(!IsTFE);
4348     ResultRegs[0] = NewResultReg;
4349   } else {
4350     // We have to repack into a new vector of some kind.
4351     for (int I = 0; I != NumDataRegs; ++I)
4352       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4353     B.buildUnmerge(ResultRegs, NewResultReg);
4354 
4355     // Drop the final TFE element to get the data part. The TFE result is
4356     // directly written to the right place already.
4357     if (IsTFE)
4358       ResultRegs.resize(NumDataRegs);
4359   }
4360 
4361   // For an s16 scalar result, we form an s32 result with a truncate regardless
4362   // of packed vs. unpacked.
4363   if (IsD16 && !Ty.isVector()) {
4364     B.buildTrunc(DstReg, ResultRegs[0]);
4365     return true;
4366   }
4367 
4368   // Avoid a build/concat_vector of 1 entry.
4369   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4370     B.buildBitcast(DstReg, ResultRegs[0]);
4371     return true;
4372   }
4373 
4374   assert(Ty.isVector());
4375 
4376   if (IsD16) {
4377     // For packed D16 results with TFE enabled, all the data components are
4378     // S32. Cast back to the expected type.
4379     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4382     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4383       for (Register &Reg : ResultRegs)
4384         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4385     } else if (ST.hasUnpackedD16VMem()) {
4386       for (Register &Reg : ResultRegs)
4387         Reg = B.buildTrunc(S16, Reg).getReg(0);
4388     }
4389   }
4390 
4391   auto padWithUndef = [&](LLT Ty, int NumElts) {
4392     if (NumElts == 0)
4393       return;
4394     Register Undef = B.buildUndef(Ty).getReg(0);
4395     for (int I = 0; I != NumElts; ++I)
4396       ResultRegs.push_back(Undef);
4397   };
4398 
4399   // Pad out any elements eliminated due to the dmask.
4400   LLT ResTy = MRI->getType(ResultRegs[0]);
4401   if (!ResTy.isVector()) {
4402     padWithUndef(ResTy, NumElts - ResultRegs.size());
4403     B.buildBuildVector(DstReg, ResultRegs);
4404     return true;
4405   }
4406 
4407   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4408   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4409 
4410   // Deal with the one annoying legal case.
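  //
  // A <3 x s16> result is padded with undef <2 x s16> pieces, concatenated
  // into a <6 x s16> vector, and then unmerged into the real <3 x s16>
  // destination plus an unused <3 x s16> remainder.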
4411   const LLT V3S16 = LLT::vector(3, 16);
4412   if (Ty == V3S16) {
4413     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4414     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4415     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4416     return true;
4417   }
4418 
4419   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4420   B.buildConcatVectors(DstReg, ResultRegs);
4421   return true;
4422 }
4423 
4424 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4425   LegalizerHelper &Helper, MachineInstr &MI) const {
4426   MachineIRBuilder &B = Helper.MIRBuilder;
4427   GISelChangeObserver &Observer = Helper.Observer;
4428 
4429   Register Dst = MI.getOperand(0).getReg();
4430   LLT Ty = B.getMRI()->getType(Dst);
4431   unsigned Size = Ty.getSizeInBits();
4432   MachineFunction &MF = B.getMF();
4433 
4434   Observer.changingInstr(MI);
4435 
4436   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4437     Ty = getBitcastRegisterType(Ty);
4438     Helper.bitcastDst(MI, Ty, 0);
4439     Dst = MI.getOperand(0).getReg();
4440     B.setInsertPt(B.getMBB(), MI);
4441   }
4442 
4443   // FIXME: We don't really need this intermediate instruction. The intrinsic
4444   // should be fixed to have a memory operand. Since it's readnone, we're not
4445   // allowed to add one.
4446   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4447   MI.RemoveOperand(1); // Remove intrinsic ID
4448 
4449   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4450   // TODO: Should this use datalayout alignment?
4451   const unsigned MemSize = (Size + 7) / 8;
4452   const Align MemAlign(4);
4453   MachineMemOperand *MMO = MF.getMachineMemOperand(
4454       MachinePointerInfo(),
4455       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4456           MachineMemOperand::MOInvariant,
4457       MemSize, MemAlign);
4458   MI.addMemOperand(MF, MMO);
4459 
4460   // There are no 96-bit result scalar loads, but widening to 128-bit should
4461   // always be legal. We may need to restore this to a 96-bit result if it turns
4462   // out this needs to be converted to a vector load during RegBankSelect.
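  //
  // E.g. an s96 result is widened to s128, and a <3 x s32> result gains a
  // fourth element to become <4 x s32>.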
4463   if (!isPowerOf2_32(Size)) {
4464     if (Ty.isVector())
4465       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4466     else
4467       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4468   }
4469 
4470   Observer.changedInstr(MI);
4471   return true;
4472 }
4473 
4474 // TODO: Move to selection
4475 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4476                                                 MachineRegisterInfo &MRI,
4477                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4479   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4480       !ST.isTrapHandlerEnabled()) {
4481     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4482   } else {
4483     // Pass queue pointer to trap handler as input, and insert trap instruction
4484     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4485     MachineRegisterInfo &MRI = *B.getMRI();
4486 
4487     Register LiveIn =
4488       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4489     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4490       return false;
4491 
4492     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4493     B.buildCopy(SGPR01, LiveIn);
4494     B.buildInstr(AMDGPU::S_TRAP)
4495         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4496         .addReg(SGPR01, RegState::Implicit);
4497   }
4498 
4499   MI.eraseFromParent();
4500   return true;
4501 }
4502 
4503 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4504     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4507   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4508       !ST.isTrapHandlerEnabled()) {
4509     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4510                                      "debugtrap handler not supported",
4511                                      MI.getDebugLoc(), DS_Warning);
4512     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4513     Ctx.diagnose(NoTrap);
4514   } else {
4515     // Insert debug-trap instruction
4516     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4517   }
4518 
4519   MI.eraseFromParent();
4520   return true;
4521 }
4522 
4523 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4524                                                MachineIRBuilder &B) const {
4525   MachineRegisterInfo &MRI = *B.getMRI();
4526   const LLT S16 = LLT::scalar(16);
4527   const LLT S32 = LLT::scalar(32);
4528 
4529   Register DstReg = MI.getOperand(0).getReg();
4530   Register NodePtr = MI.getOperand(2).getReg();
4531   Register RayExtent = MI.getOperand(3).getReg();
4532   Register RayOrigin = MI.getOperand(4).getReg();
4533   Register RayDir = MI.getOperand(5).getReg();
4534   Register RayInvDir = MI.getOperand(6).getReg();
4535   Register TDescr = MI.getOperand(7).getReg();
4536 
4537   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4539   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4540                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4541                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4542                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4543 
4544   SmallVector<Register, 12> Ops;
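  // A 64-bit node pointer is passed as two s32 dwords.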
4545   if (Is64) {
4546     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4547     Ops.push_back(Unmerge.getReg(0));
4548     Ops.push_back(Unmerge.getReg(1));
4549   } else {
4550     Ops.push_back(NodePtr);
4551   }
4552   Ops.push_back(RayExtent);
4553 
  // Append the first three (x, y, z) dwords of a packed <4 x s32> ray vector
  // operand; the fourth lane is unused.
  auto packLanes = [&Ops, &S32, &B](Register Src) {
4555     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4556     Ops.push_back(Unmerge.getReg(0));
4557     Ops.push_back(Unmerge.getReg(1));
4558     Ops.push_back(Unmerge.getReg(2));
4559   };
4560 
4561   packLanes(RayOrigin);
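  // With 16-bit addresses, the ray direction and inverse direction components
  // are packed two per dword: (dir.x, dir.y), (dir.z, invdir.x) and
  // (invdir.y, invdir.z).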
4562   if (IsA16) {
4563     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4564     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4565     Register R1 = MRI.createGenericVirtualRegister(S32);
4566     Register R2 = MRI.createGenericVirtualRegister(S32);
4567     Register R3 = MRI.createGenericVirtualRegister(S32);
4568     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4569     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4570     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4571     Ops.push_back(R1);
4572     Ops.push_back(R2);
4573     Ops.push_back(R3);
4574   } else {
4575     packLanes(RayDir);
4576     packLanes(RayInvDir);
4577   }
4578 
4579   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4580     .addDef(DstReg)
4581     .addImm(Opcode);
4582 
4583   for (Register R : Ops) {
4584     MIB.addUse(R);
4585   }
4586 
4587   MIB.addUse(TDescr)
4588      .addImm(IsA16 ? 1 : 0)
4589      .cloneMemRefs(MI);
4590 
4591   MI.eraseFromParent();
4592   return true;
4593 }
4594 
4595 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4596                                             MachineInstr &MI) const {
4597   MachineIRBuilder &B = Helper.MIRBuilder;
4598   MachineRegisterInfo &MRI = *B.getMRI();
4599 
  // For the control-flow intrinsics, replace the G_BRCOND that uses their
  // result with the exec-manipulation and branch pseudos.
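  //
  // For example (illustrative), for amdgcn.if the intrinsic is replaced by
  // SI_IF, which defines the wave mask and branches to the block the original
  // unconditional branch targeted, while the original conditional target is
  // reached through an explicit G_BR.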
4601   auto IntrID = MI.getIntrinsicID();
4602   switch (IntrID) {
4603   case Intrinsic::amdgcn_if:
4604   case Intrinsic::amdgcn_else: {
4605     MachineInstr *Br = nullptr;
4606     MachineBasicBlock *UncondBrTarget = nullptr;
4607     bool Negated = false;
4608     if (MachineInstr *BrCond =
4609             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4610       const SIRegisterInfo *TRI
4611         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4612 
4613       Register Def = MI.getOperand(1).getReg();
4614       Register Use = MI.getOperand(3).getReg();
4615 
4616       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4617 
4618       if (Negated)
4619         std::swap(CondBrTarget, UncondBrTarget);
4620 
4621       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4622       if (IntrID == Intrinsic::amdgcn_if) {
4623         B.buildInstr(AMDGPU::SI_IF)
4624           .addDef(Def)
4625           .addUse(Use)
4626           .addMBB(UncondBrTarget);
4627       } else {
4628         B.buildInstr(AMDGPU::SI_ELSE)
4629             .addDef(Def)
4630             .addUse(Use)
4631             .addMBB(UncondBrTarget);
4632       }
4633 
4634       if (Br) {
4635         Br->getOperand(0).setMBB(CondBrTarget);
4636       } else {
4637         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4638         // since we're swapping branch targets it needs to be reinserted.
4639         // FIXME: IRTranslator should probably not do this
4640         B.buildBr(*CondBrTarget);
4641       }
4642 
4643       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4644       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4645       MI.eraseFromParent();
4646       BrCond->eraseFromParent();
4647       return true;
4648     }
4649 
4650     return false;
4651   }
4652   case Intrinsic::amdgcn_loop: {
4653     MachineInstr *Br = nullptr;
4654     MachineBasicBlock *UncondBrTarget = nullptr;
4655     bool Negated = false;
4656     if (MachineInstr *BrCond =
4657             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4658       const SIRegisterInfo *TRI
4659         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4660 
4661       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4662       Register Reg = MI.getOperand(2).getReg();
4663 
4664       if (Negated)
4665         std::swap(CondBrTarget, UncondBrTarget);
4666 
4667       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4668       B.buildInstr(AMDGPU::SI_LOOP)
4669         .addUse(Reg)
4670         .addMBB(UncondBrTarget);
4671 
4672       if (Br)
4673         Br->getOperand(0).setMBB(CondBrTarget);
4674       else
4675         B.buildBr(*CondBrTarget);
4676 
4677       MI.eraseFromParent();
4678       BrCond->eraseFromParent();
4679       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4680       return true;
4681     }
4682 
4683     return false;
4684   }
4685   case Intrinsic::amdgcn_kernarg_segment_ptr:
4686     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4687       // This only makes sense to call in a kernel, so just lower to null.
4688       B.buildConstant(MI.getOperand(0).getReg(), 0);
4689       MI.eraseFromParent();
4690       return true;
4691     }
4692 
4693     return legalizePreloadedArgIntrin(
4694       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4695   case Intrinsic::amdgcn_implicitarg_ptr:
4696     return legalizeImplicitArgPtr(MI, MRI, B);
4697   case Intrinsic::amdgcn_workitem_id_x:
4698     return legalizePreloadedArgIntrin(MI, MRI, B,
4699                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4700   case Intrinsic::amdgcn_workitem_id_y:
4701     return legalizePreloadedArgIntrin(MI, MRI, B,
4702                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4703   case Intrinsic::amdgcn_workitem_id_z:
4704     return legalizePreloadedArgIntrin(MI, MRI, B,
4705                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4706   case Intrinsic::amdgcn_workgroup_id_x:
4707     return legalizePreloadedArgIntrin(MI, MRI, B,
4708                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4709   case Intrinsic::amdgcn_workgroup_id_y:
4710     return legalizePreloadedArgIntrin(MI, MRI, B,
4711                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4712   case Intrinsic::amdgcn_workgroup_id_z:
4713     return legalizePreloadedArgIntrin(MI, MRI, B,
4714                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4715   case Intrinsic::amdgcn_dispatch_ptr:
4716     return legalizePreloadedArgIntrin(MI, MRI, B,
4717                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4718   case Intrinsic::amdgcn_queue_ptr:
4719     return legalizePreloadedArgIntrin(MI, MRI, B,
4720                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4721   case Intrinsic::amdgcn_implicit_buffer_ptr:
4722     return legalizePreloadedArgIntrin(
4723       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4724   case Intrinsic::amdgcn_dispatch_id:
4725     return legalizePreloadedArgIntrin(MI, MRI, B,
4726                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4727   case Intrinsic::amdgcn_fdiv_fast:
4728     return legalizeFDIVFastIntrin(MI, MRI, B);
4729   case Intrinsic::amdgcn_is_shared:
4730     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4731   case Intrinsic::amdgcn_is_private:
4732     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4733   case Intrinsic::amdgcn_wavefrontsize: {
4734     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4735     MI.eraseFromParent();
4736     return true;
4737   }
4738   case Intrinsic::amdgcn_s_buffer_load:
4739     return legalizeSBufferLoad(Helper, MI);
4740   case Intrinsic::amdgcn_raw_buffer_store:
4741   case Intrinsic::amdgcn_struct_buffer_store:
4742     return legalizeBufferStore(MI, MRI, B, false, false);
4743   case Intrinsic::amdgcn_raw_buffer_store_format:
4744   case Intrinsic::amdgcn_struct_buffer_store_format:
4745     return legalizeBufferStore(MI, MRI, B, false, true);
4746   case Intrinsic::amdgcn_raw_tbuffer_store:
4747   case Intrinsic::amdgcn_struct_tbuffer_store:
4748     return legalizeBufferStore(MI, MRI, B, true, true);
4749   case Intrinsic::amdgcn_raw_buffer_load:
4750   case Intrinsic::amdgcn_struct_buffer_load:
4751     return legalizeBufferLoad(MI, MRI, B, false, false);
4752   case Intrinsic::amdgcn_raw_buffer_load_format:
4753   case Intrinsic::amdgcn_struct_buffer_load_format:
4754     return legalizeBufferLoad(MI, MRI, B, true, false);
4755   case Intrinsic::amdgcn_raw_tbuffer_load:
4756   case Intrinsic::amdgcn_struct_tbuffer_load:
4757     return legalizeBufferLoad(MI, MRI, B, true, true);
4758   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4759   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4760   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4761   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4762   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4763   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4764   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4765   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4766   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4767   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4768   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4769   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4770   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4771   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4772   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4773   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4774   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4775   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4776   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4777   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4778   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4779   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4780   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4781   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4782   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4783   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4784   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4785   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4786     return legalizeBufferAtomic(MI, B, IntrID);
4787   case Intrinsic::amdgcn_atomic_inc:
4788     return legalizeAtomicIncDec(MI, B, true);
4789   case Intrinsic::amdgcn_atomic_dec:
4790     return legalizeAtomicIncDec(MI, B, false);
4791   case Intrinsic::trap:
4792     return legalizeTrapIntrinsic(MI, MRI, B);
4793   case Intrinsic::debugtrap:
4794     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4795   case Intrinsic::amdgcn_rsq_clamp:
4796     return legalizeRsqClampIntrinsic(MI, MRI, B);
4797   case Intrinsic::amdgcn_ds_fadd:
4798   case Intrinsic::amdgcn_ds_fmin:
4799   case Intrinsic::amdgcn_ds_fmax:
4800     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4801   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4802     return legalizeBVHIntrinsic(MI, B);
4803   default: {
4804     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4805             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4806       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4807     return true;
4808   }
4809   }
4810 
4811   return true;
4812 }
4813