1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of vector elements up to the next power of two.
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the scalar size up to the next power-of-two number of bits.
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
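// Reduce the number of vector elements so that each resulting piece fits in at
// most 64 bits; used with fewerElementsIf to split wide vector operations.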
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
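// Return the type a value of this size should be bitcast to so that it maps
// cleanly onto registers: a single scalar for sizes of 32 bits or less,
// otherwise a vector of 32-bit elements.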
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
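// Bitcast a type whose size is a multiple of 32 to a vector (or scalar) of
// 32-bit elements.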
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
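// True for vectors whose element type is usable directly in registers: s16 or
// any element of at least 32 bits.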
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
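// True for scalar truncating stores where the stored register is wider than
// 32 bits and wider than the memory size.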
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
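// Check whether this combination of register size, memory size, address space
// and alignment can be selected directly as a load or store.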
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
341 
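// A load or store is legal if the result maps onto registers, the access size
// is directly selectable, and the type does not need the bitcast workaround.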
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348 
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
353   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
356 
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
362 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
366 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
367                             unsigned AlignInBits, unsigned AddrSpace,
368                             unsigned Opcode) {
369   // We don't want to widen cases that are naturally legal.
370   if (isPowerOf2_32(SizeInBits))
371     return false;
372 
373   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
374   // end up widening these for a scalar load during RegBankSelect, since there
375   // aren't 96-bit scalar loads.
376   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
377     return false;
378 
379   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
380     return false;
381 
382   // A load is known dereferenceable up to the alignment, so it's legal to widen
383   // to it.
384   //
385   // TODO: Could check dereferenceable for less aligned cases.
386   unsigned RoundedSize = NextPowerOf2(SizeInBits);
387   if (AlignInBits < RoundedSize)
388     return false;
389 
390   // Do not widen if it would introduce a slow unaligned load.
391   const SITargetLowering *TLI = ST.getTargetLowering();
392   bool Fast = false;
393   return TLI->allowsMisalignedMemoryAccessesImpl(
394              RoundedSize, AddrSpace, Align(AlignInBits / 8),
395              MachineMemOperand::MOLoad, &Fast) &&
396          Fast;
397 }
398 
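// Query-based wrapper for the above; atomic accesses are never widened.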
399 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
400                             unsigned Opcode) {
401   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
402     return false;
403 
404   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
405                          Query.MMODescrs[0].AlignInBits,
406                          Query.Types[1].getAddressSpace(), Opcode);
407 }
408 
409 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
410                                          const GCNTargetMachine &TM)
411   :  ST(ST_) {
412   using namespace TargetOpcode;
413 
414   auto GetAddrSpacePtr = [&TM](unsigned AS) {
415     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
416   };
417 
418   const LLT S1 = LLT::scalar(1);
419   const LLT S16 = LLT::scalar(16);
420   const LLT S32 = LLT::scalar(32);
421   const LLT S64 = LLT::scalar(64);
422   const LLT S128 = LLT::scalar(128);
423   const LLT S256 = LLT::scalar(256);
424   const LLT S512 = LLT::scalar(512);
425   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
426 
427   const LLT V2S16 = LLT::vector(2, 16);
428   const LLT V4S16 = LLT::vector(4, 16);
429 
430   const LLT V2S32 = LLT::vector(2, 32);
431   const LLT V3S32 = LLT::vector(3, 32);
432   const LLT V4S32 = LLT::vector(4, 32);
433   const LLT V5S32 = LLT::vector(5, 32);
434   const LLT V6S32 = LLT::vector(6, 32);
435   const LLT V7S32 = LLT::vector(7, 32);
436   const LLT V8S32 = LLT::vector(8, 32);
437   const LLT V9S32 = LLT::vector(9, 32);
438   const LLT V10S32 = LLT::vector(10, 32);
439   const LLT V11S32 = LLT::vector(11, 32);
440   const LLT V12S32 = LLT::vector(12, 32);
441   const LLT V13S32 = LLT::vector(13, 32);
442   const LLT V14S32 = LLT::vector(14, 32);
443   const LLT V15S32 = LLT::vector(15, 32);
444   const LLT V16S32 = LLT::vector(16, 32);
445   const LLT V32S32 = LLT::vector(32, 32);
446 
447   const LLT V2S64 = LLT::vector(2, 64);
448   const LLT V3S64 = LLT::vector(3, 64);
449   const LLT V4S64 = LLT::vector(4, 64);
450   const LLT V5S64 = LLT::vector(5, 64);
451   const LLT V6S64 = LLT::vector(6, 64);
452   const LLT V7S64 = LLT::vector(7, 64);
453   const LLT V8S64 = LLT::vector(8, 64);
454   const LLT V16S64 = LLT::vector(16, 64);
455 
456   std::initializer_list<LLT> AllS32Vectors =
457     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
458      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
459   std::initializer_list<LLT> AllS64Vectors =
460     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
461 
462   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
463   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
464   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
465   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
466   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
467   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
468   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
469 
470   const LLT CodePtr = FlatPtr;
471 
472   const std::initializer_list<LLT> AddrSpaces64 = {
473     GlobalPtr, ConstantPtr, FlatPtr
474   };
475 
476   const std::initializer_list<LLT> AddrSpaces32 = {
477     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
478   };
479 
480   const std::initializer_list<LLT> FPTypesBase = {
481     S32, S64
482   };
483 
484   const std::initializer_list<LLT> FPTypes16 = {
485     S32, S64, S16
486   };
487 
488   const std::initializer_list<LLT> FPTypesPK16 = {
489     S32, S64, S16, V2S16
490   };
491 
492   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
493 
494   setAction({G_BRCOND, S1}, Legal); // VCC branches
495   setAction({G_BRCOND, S32}, Legal); // SCC branches
496 
497   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
498   // elements for v3s16
499   getActionDefinitionsBuilder(G_PHI)
500     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
501     .legalFor(AllS32Vectors)
502     .legalFor(AllS64Vectors)
503     .legalFor(AddrSpaces64)
504     .legalFor(AddrSpaces32)
505     .legalIf(isPointer(0))
506     .clampScalar(0, S16, S256)
507     .widenScalarToNextPow2(0, 32)
508     .clampMaxNumElements(0, S32, 16)
509     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
510     .scalarize(0);
511 
512   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
513     // Full set of gfx9 features.
514     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
515       .legalFor({S32, S16, V2S16})
516       .clampScalar(0, S16, S32)
517       .clampMaxNumElements(0, S16, 2)
518       .scalarize(0)
519       .widenScalarToNextPow2(0, 32);
520 
521     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
522       .legalFor({S32, S16, V2S16}) // Clamp modifier
523       .minScalarOrElt(0, S16)
524       .clampMaxNumElements(0, S16, 2)
525       .scalarize(0)
526       .widenScalarToNextPow2(0, 32)
527       .lower();
528   } else if (ST.has16BitInsts()) {
529     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
530       .legalFor({S32, S16})
531       .clampScalar(0, S16, S32)
532       .scalarize(0)
533       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
534 
535     // Technically the saturating operations require clamp bit support, but this
536     // was introduced at the same time as 16-bit operations.
537     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
538       .legalFor({S32, S16}) // Clamp modifier
539       .minScalar(0, S16)
540       .scalarize(0)
541       .widenScalarToNextPow2(0, 16)
542       .lower();
543 
544     // We're just lowering this, but it helps get a better result to try to
545     // coerce to the desired type first.
546     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
547       .minScalar(0, S16)
548       .scalarize(0)
549       .lower();
550   } else {
551     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
552       .legalFor({S32})
553       .clampScalar(0, S32, S32)
554       .scalarize(0);
555 
556     if (ST.hasIntClamp()) {
557       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
558         .legalFor({S32}) // Clamp modifier.
559         .scalarize(0)
560         .minScalarOrElt(0, S32)
561         .lower();
562     } else {
563       // Clamp bit support was added in VI, along with 16-bit operations.
564       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
565         .minScalar(0, S32)
566         .scalarize(0)
567         .lower();
568     }
569 
570     // FIXME: DAG expansion gets better results. The widening uses the smaller
571     // range values and goes for the min/max lowering directly.
572     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
573       .minScalar(0, S32)
574       .scalarize(0)
575       .lower();
576   }
577 
578   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
579     .customFor({S32, S64})
580     .clampScalar(0, S32, S64)
581     .widenScalarToNextPow2(0, 32)
582     .scalarize(0);
583 
584   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
585     .legalFor({S32})
586     .clampScalar(0, S32, S32)
587     .scalarize(0);
588 
589   // Report legal for any types we can handle anywhere. For the cases only legal
590   // on the SALU, RegBankSelect will be able to re-legalize.
591   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
592     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
593     .clampScalar(0, S32, S64)
594     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
595     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
596     .widenScalarToNextPow2(0)
597     .scalarize(0);
598 
599   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
600                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
601     .legalFor({{S32, S1}, {S32, S32}})
602     .minScalar(0, S32)
603     // TODO: .scalarize(0)
604     .lower();
605 
606   getActionDefinitionsBuilder(G_BITCAST)
607     // Don't worry about the size constraint.
608     .legalIf(all(isRegisterType(0), isRegisterType(1)))
609     .lower();
610 
611 
612   getActionDefinitionsBuilder(G_CONSTANT)
613     .legalFor({S1, S32, S64, S16, GlobalPtr,
614                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
615     .legalIf(isPointer(0))
616     .clampScalar(0, S32, S64)
617     .widenScalarToNextPow2(0);
618 
619   getActionDefinitionsBuilder(G_FCONSTANT)
620     .legalFor({S32, S64, S16})
621     .clampScalar(0, S16, S64);
622 
623   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
624       .legalIf(isRegisterType(0))
625       // s1 and s16 are special cases because they have legal operations on
626       // them, but don't really occupy registers in the normal way.
627       .legalFor({S1, S16})
628       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
629       .clampScalarOrElt(0, S32, MaxScalar)
630       .widenScalarToNextPow2(0, 32)
631       .clampMaxNumElements(0, S32, 16);
632 
633   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
634 
635   // If the amount is divergent, we have to do a wave reduction to get the
636   // maximum value, so this is expanded during RegBankSelect.
637   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
638     .legalFor({{PrivatePtr, S32}});
639 
640   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
641     .customIf(typeIsNot(0, PrivatePtr));
642 
643   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
644 
645   auto &FPOpActions = getActionDefinitionsBuilder(
646     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
647     .legalFor({S32, S64});
648   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
649     .customFor({S32, S64});
650   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
651     .customFor({S32, S64});
652 
653   if (ST.has16BitInsts()) {
654     if (ST.hasVOP3PInsts())
655       FPOpActions.legalFor({S16, V2S16});
656     else
657       FPOpActions.legalFor({S16});
658 
659     TrigActions.customFor({S16});
660     FDIVActions.customFor({S16});
661   }
662 
663   auto &MinNumMaxNum = getActionDefinitionsBuilder({
664       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
665 
666   if (ST.hasVOP3PInsts()) {
667     MinNumMaxNum.customFor(FPTypesPK16)
668       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
669       .clampMaxNumElements(0, S16, 2)
670       .clampScalar(0, S16, S64)
671       .scalarize(0);
672   } else if (ST.has16BitInsts()) {
673     MinNumMaxNum.customFor(FPTypes16)
674       .clampScalar(0, S16, S64)
675       .scalarize(0);
676   } else {
677     MinNumMaxNum.customFor(FPTypesBase)
678       .clampScalar(0, S32, S64)
679       .scalarize(0);
680   }
681 
682   if (ST.hasVOP3PInsts())
683     FPOpActions.clampMaxNumElements(0, S16, 2);
684 
685   FPOpActions
686     .scalarize(0)
687     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
688 
689   TrigActions
690     .scalarize(0)
691     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
692 
693   FDIVActions
694     .scalarize(0)
695     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
696 
697   getActionDefinitionsBuilder({G_FNEG, G_FABS})
698     .legalFor(FPTypesPK16)
699     .clampMaxNumElements(0, S16, 2)
700     .scalarize(0)
701     .clampScalar(0, S16, S64);
702 
703   if (ST.has16BitInsts()) {
704     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
705       .legalFor({S32, S64, S16})
706       .scalarize(0)
707       .clampScalar(0, S16, S64);
708   } else {
709     getActionDefinitionsBuilder(G_FSQRT)
710       .legalFor({S32, S64})
711       .scalarize(0)
712       .clampScalar(0, S32, S64);
713 
714     if (ST.hasFractBug()) {
715       getActionDefinitionsBuilder(G_FFLOOR)
716         .customFor({S64})
717         .legalFor({S32, S64})
718         .scalarize(0)
719         .clampScalar(0, S32, S64);
720     } else {
721       getActionDefinitionsBuilder(G_FFLOOR)
722         .legalFor({S32, S64})
723         .scalarize(0)
724         .clampScalar(0, S32, S64);
725     }
726   }
727 
728   getActionDefinitionsBuilder(G_FPTRUNC)
729     .legalFor({{S32, S64}, {S16, S32}})
730     .scalarize(0)
731     .lower();
732 
733   getActionDefinitionsBuilder(G_FPEXT)
734     .legalFor({{S64, S32}, {S32, S16}})
735     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
736     .scalarize(0);
737 
738   getActionDefinitionsBuilder(G_FSUB)
739       // Use actual fsub instruction
740       .legalFor({S32})
741       // Must use fadd + fneg
742       .lowerFor({S64, S16, V2S16})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746   // Whether this is legal depends on the floating point mode for the function.
747   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
748   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
749     FMad.customFor({S32, S16});
750   else if (ST.hasMadMacF32Insts())
751     FMad.customFor({S32});
752   else if (ST.hasMadF16())
753     FMad.customFor({S16});
754   FMad.scalarize(0)
755       .lower();
756 
757   auto &FRem = getActionDefinitionsBuilder(G_FREM);
758   if (ST.has16BitInsts()) {
759     FRem.customFor({S16, S32, S64});
760   } else {
761     FRem.minScalar(0, S32)
762         .customFor({S32, S64});
763   }
764   FRem.scalarize(0);
765 
766   // TODO: Do we need to clamp maximum bitwidth?
767   getActionDefinitionsBuilder(G_TRUNC)
768     .legalIf(isScalar(0))
769     .legalFor({{V2S16, V2S32}})
770     .clampMaxNumElements(0, S16, 2)
771     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
772     // situations (like an invalid implicit use), we don't want to infinite loop
773     // in the legalizer.
774     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
775     .alwaysLegal();
776 
777   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
778     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
779                {S32, S1}, {S64, S1}, {S16, S1}})
780     .scalarize(0)
781     .clampScalar(0, S32, S64)
782     .widenScalarToNextPow2(1, 32);
783 
784   // TODO: Split s1->s64 during regbankselect for VALU.
785   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
786     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
787     .lowerFor({{S32, S64}})
788     .lowerIf(typeIs(1, S1))
789     .customFor({{S64, S64}});
790   if (ST.has16BitInsts())
791     IToFP.legalFor({{S16, S16}});
792   IToFP.clampScalar(1, S32, S64)
793        .minScalar(0, S32)
794        .scalarize(0)
795        .widenScalarToNextPow2(1);
796 
797   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
798     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
799     .customFor({{S64, S64}})
800     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
801   if (ST.has16BitInsts())
802     FPToI.legalFor({{S16, S16}});
803   else
804     FPToI.minScalar(1, S32);
805 
806   FPToI.minScalar(0, S32)
807        .scalarize(0)
808        .lower();
809 
810   // Lower roundeven into G_FRINT
811   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
812     .scalarize(0)
813     .lower();
814 
815   if (ST.has16BitInsts()) {
816     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
817       .legalFor({S16, S32, S64})
818       .clampScalar(0, S16, S64)
819       .scalarize(0);
820   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
821     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
822       .legalFor({S32, S64})
823       .clampScalar(0, S32, S64)
824       .scalarize(0);
825   } else {
826     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
827       .legalFor({S32})
828       .customFor({S64})
829       .clampScalar(0, S32, S64)
830       .scalarize(0);
831   }
832 
833   getActionDefinitionsBuilder(G_PTR_ADD)
834     .legalIf(all(isPointer(0), sameSize(0, 1)))
835     .scalarize(0)
836     .scalarSameSizeAs(1, 0);
837 
838   getActionDefinitionsBuilder(G_PTRMASK)
839     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
840     .scalarSameSizeAs(1, 0)
841     .scalarize(0);
842 
843   auto &CmpBuilder =
844     getActionDefinitionsBuilder(G_ICMP)
845     // The compare output type differs based on the register bank of the output,
846     // so make both s1 and s32 legal.
847     //
848     // Scalar compares producing output in scc will be promoted to s32, as that
849     // is the allocatable register type that will be needed for the copy from
850     // scc. This will be promoted during RegBankSelect, and we assume something
851     // before that won't try to use s32 result types.
852     //
853     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
854     // bank.
855     .legalForCartesianProduct(
856       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
857     .legalForCartesianProduct(
858       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
859   if (ST.has16BitInsts()) {
860     CmpBuilder.legalFor({{S1, S16}});
861   }
862 
863   CmpBuilder
864     .widenScalarToNextPow2(1)
865     .clampScalar(1, S32, S64)
866     .scalarize(0)
867     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
868 
869   getActionDefinitionsBuilder(G_FCMP)
870     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
871     .widenScalarToNextPow2(1)
872     .clampScalar(1, S32, S64)
873     .scalarize(0);
874 
875   // FIXME: fpow has a selection pattern that should move to custom lowering.
876   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
877   if (ST.has16BitInsts())
878     Exp2Ops.legalFor({S32, S16});
879   else
880     Exp2Ops.legalFor({S32});
881   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
882   Exp2Ops.scalarize(0);
883 
884   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
885   if (ST.has16BitInsts())
886     ExpOps.customFor({{S32}, {S16}});
887   else
888     ExpOps.customFor({S32});
889   ExpOps.clampScalar(0, MinScalarFPTy, S32)
890         .scalarize(0);
891 
892   getActionDefinitionsBuilder(G_FPOWI)
893     .clampScalar(0, MinScalarFPTy, S32)
894     .lower();
895 
896   // The 64-bit versions produce 32-bit results, but only on the SALU.
897   getActionDefinitionsBuilder(G_CTPOP)
898     .legalFor({{S32, S32}, {S32, S64}})
899     .clampScalar(0, S32, S32)
900     .clampScalar(1, S32, S64)
901     .scalarize(0)
902     .widenScalarToNextPow2(0, 32)
903     .widenScalarToNextPow2(1, 32);
904 
905   // The hardware instructions return a different result on 0 than the generic
906   // instructions expect. The hardware produces -1, but these produce the
907   // bitwidth.
908   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
909     .scalarize(0)
910     .clampScalar(0, S32, S32)
911     .clampScalar(1, S32, S64)
912     .widenScalarToNextPow2(0, 32)
913     .widenScalarToNextPow2(1, 32)
914     .lower();
915 
916   // The 64-bit versions produce 32-bit results, but only on the SALU.
917   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
918     .legalFor({{S32, S32}, {S32, S64}})
919     .clampScalar(0, S32, S32)
920     .clampScalar(1, S32, S64)
921     .scalarize(0)
922     .widenScalarToNextPow2(0, 32)
923     .widenScalarToNextPow2(1, 32);
924 
925   getActionDefinitionsBuilder(G_BITREVERSE)
926     .legalFor({S32})
927     .clampScalar(0, S32, S32)
928     .scalarize(0);
929 
930   if (ST.has16BitInsts()) {
931     getActionDefinitionsBuilder(G_BSWAP)
932       .legalFor({S16, S32, V2S16})
933       .clampMaxNumElements(0, S16, 2)
934       // FIXME: Fixing non-power-of-2 before clamp is workaround for
935       // narrowScalar limitation.
936       .widenScalarToNextPow2(0)
937       .clampScalar(0, S16, S32)
938       .scalarize(0);
939 
940     if (ST.hasVOP3PInsts()) {
941       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
942         .legalFor({S32, S16, V2S16})
943         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
944         .clampMaxNumElements(0, S16, 2)
945         .minScalar(0, S16)
946         .widenScalarToNextPow2(0)
947         .scalarize(0)
948         .lower();
949     } else {
950       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
951         .legalFor({S32, S16})
952         .widenScalarToNextPow2(0)
953         .minScalar(0, S16)
954         .scalarize(0)
955         .lower();
956     }
957   } else {
958     // TODO: Should have same legality without v_perm_b32
959     getActionDefinitionsBuilder(G_BSWAP)
960       .legalFor({S32})
961       .lowerIf(scalarNarrowerThan(0, 32))
962       // FIXME: Fixing non-power-of-2 before clamp is workaround for
963       // narrowScalar limitation.
964       .widenScalarToNextPow2(0)
965       .maxScalar(0, S32)
966       .scalarize(0)
967       .lower();
968 
969     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970       .legalFor({S32})
971       .minScalar(0, S32)
972       .widenScalarToNextPow2(0)
973       .scalarize(0)
974       .lower();
975   }
976 
977   getActionDefinitionsBuilder(G_INTTOPTR)
978     // List the common cases
979     .legalForCartesianProduct(AddrSpaces64, {S64})
980     .legalForCartesianProduct(AddrSpaces32, {S32})
981     .scalarize(0)
982     // Accept any address space as long as the size matches
983     .legalIf(sameSize(0, 1))
984     .widenScalarIf(smallerThan(1, 0),
985       [](const LegalityQuery &Query) {
986         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
987       })
988     .narrowScalarIf(largerThan(1, 0),
989       [](const LegalityQuery &Query) {
990         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
991       });
992 
993   getActionDefinitionsBuilder(G_PTRTOINT)
994     // List the common cases
995     .legalForCartesianProduct(AddrSpaces64, {S64})
996     .legalForCartesianProduct(AddrSpaces32, {S32})
997     .scalarize(0)
998     // Accept any address space as long as the size matches
999     .legalIf(sameSize(0, 1))
1000     .widenScalarIf(smallerThan(0, 1),
1001       [](const LegalityQuery &Query) {
1002         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1003       })
1004     .narrowScalarIf(
1005       largerThan(0, 1),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1008       });
1009 
1010   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1011     .scalarize(0)
1012     .custom();
1013 
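  // Return true if a load or store must be split: the access is wider than the
  // address space supports, does not break down into a supported number of
  // 32-bit registers, or is insufficiently aligned.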
1014   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1015                                     bool IsLoad) -> bool {
1016     const LLT DstTy = Query.Types[0];
1017 
1018     // Split vector extloads.
1019     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1021 
1022     if (MemSize < DstTy.getSizeInBits())
1023       MemSize = std::max(MemSize, AlignBits);
1024 
1025     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1026       return true;
1027 
1028     const LLT PtrTy = Query.Types[1];
1029     unsigned AS = PtrTy.getAddressSpace();
1030     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1031       return true;
1032 
1033     // Catch weird sized loads that don't evenly divide into the access sizes
1034     // TODO: May be able to widen depending on alignment etc.
1035     unsigned NumRegs = (MemSize + 31) / 32;
1036     if (NumRegs == 3) {
1037       if (!ST.hasDwordx3LoadStores())
1038         return true;
1039     } else {
1040       // If the alignment allows, these should have been widened.
1041       if (!isPowerOf2_32(NumRegs))
1042         return true;
1043     }
1044 
1045     if (AlignBits < MemSize) {
1046       const SITargetLowering *TLI = ST.getTargetLowering();
1047       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1048                                                       Align(AlignBits / 8));
1049     }
1050 
1051     return false;
1052   };
1053 
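  // Minimum alignment (in bits) required for the explicitly listed global and
  // constant cases below; 0 means no restriction when the subtarget supports
  // unaligned buffer access.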
1054   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
1055   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
1056   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
1057 
1058   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1059   // LDS
1060   // TODO: Unsupported flat for SI.
1061 
1062   for (unsigned Op : {G_LOAD, G_STORE}) {
1063     const bool IsStore = Op == G_STORE;
1064 
1065     auto &Actions = getActionDefinitionsBuilder(Op);
1066     // Explicitly list some common cases.
1067     // TODO: Does this help compile time at all?
1068     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1069                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1070                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1071                                       {S64, GlobalPtr, 64, GlobalAlign32},
1072                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1073                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1074                                       {S32, GlobalPtr, 8, GlobalAlign8},
1075                                       {S32, GlobalPtr, 16, GlobalAlign16},
1076 
1077                                       {S32, LocalPtr, 32, 32},
1078                                       {S64, LocalPtr, 64, 32},
1079                                       {V2S32, LocalPtr, 64, 32},
1080                                       {S32, LocalPtr, 8, 8},
1081                                       {S32, LocalPtr, 16, 16},
1082                                       {V2S16, LocalPtr, 32, 32},
1083 
1084                                       {S32, PrivatePtr, 32, 32},
1085                                       {S32, PrivatePtr, 8, 8},
1086                                       {S32, PrivatePtr, 16, 16},
1087                                       {V2S16, PrivatePtr, 32, 32},
1088 
1089                                       {S32, ConstantPtr, 32, GlobalAlign32},
1090                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1091                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1092                                       {S64, ConstantPtr, 64, GlobalAlign32},
1093                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1094     Actions.legalIf(
1095       [=](const LegalityQuery &Query) -> bool {
1096         return isLoadStoreLegal(ST, Query, Op);
1097       });
1098 
1099     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1100     // 64-bits.
1101     //
1102     // TODO: Should generalize bitcast action into coerce, which will also cover
1103     // inserting addrspacecasts.
1104     Actions.customIf(typeIs(1, Constant32Ptr));
1105 
1106     // Turn any illegal element vectors into something easier to deal
1107     // with. These will ultimately produce 32-bit scalar shifts to extract the
1108     // parts anyway.
1109     //
1110     // For odd 16-bit element vectors, prefer to split those into pieces with
1111     // 16-bit vector parts.
1112     Actions.bitcastIf(
1113       [=](const LegalityQuery &Query) -> bool {
1114         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1115                                           Query.MMODescrs[0].SizeInBits);
1116       }, bitcastToRegisterType(0));
1117 
1118     if (!IsStore) {
1119       // Widen suitably aligned loads by loading extra bytes. The standard
1120       // legalization actions can't properly express widening memory operands.
1121       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1122         return shouldWidenLoad(ST, Query, G_LOAD);
1123       });
1124     }
1125 
1126     // FIXME: load/store narrowing should be moved to lower action
1127     Actions
1128         .narrowScalarIf(
1129             [=](const LegalityQuery &Query) -> bool {
1130               return !Query.Types[0].isVector() &&
1131                      needToSplitMemOp(Query, Op == G_LOAD);
1132             },
1133             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1134               const LLT DstTy = Query.Types[0];
1135               const LLT PtrTy = Query.Types[1];
1136 
1137               const unsigned DstSize = DstTy.getSizeInBits();
1138               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1139 
1140               // Split extloads.
1141               if (DstSize > MemSize)
1142                 return std::make_pair(0, LLT::scalar(MemSize));
1143 
1144               if (!isPowerOf2_32(DstSize)) {
1145                 // We're probably decomposing an odd sized store. Try to split
1146                 // to the widest type. TODO: Account for alignment. As-is it
1147                 // should be OK, since the new parts will be further legalized.
1148                 unsigned FloorSize = PowerOf2Floor(DstSize);
1149                 return std::make_pair(0, LLT::scalar(FloorSize));
1150               }
1151 
1152               if (DstSize > 32 && (DstSize % 32 != 0)) {
1153                 // FIXME: Need a way to specify non-extload of larger size if
1154                 // suitably aligned.
1155                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1156               }
1157 
1158               unsigned MaxSize = maxSizeForAddrSpace(ST,
1159                                                      PtrTy.getAddressSpace(),
1160                                                      Op == G_LOAD);
1161               if (MemSize > MaxSize)
1162                 return std::make_pair(0, LLT::scalar(MaxSize));
1163 
1164               unsigned Align = Query.MMODescrs[0].AlignInBits;
1165               return std::make_pair(0, LLT::scalar(Align));
1166             })
1167         .fewerElementsIf(
1168             [=](const LegalityQuery &Query) -> bool {
1169               return Query.Types[0].isVector() &&
1170                      needToSplitMemOp(Query, Op == G_LOAD);
1171             },
1172             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1173               const LLT DstTy = Query.Types[0];
1174               const LLT PtrTy = Query.Types[1];
1175 
1176               LLT EltTy = DstTy.getElementType();
1177               unsigned MaxSize = maxSizeForAddrSpace(ST,
1178                                                      PtrTy.getAddressSpace(),
1179                                                      Op == G_LOAD);
1180 
1181               // FIXME: Handle widened to power of 2 results better. This ends
1182               // up scalarizing.
1183               // FIXME: 3 element stores scalarized on SI
1184 
1185               // Split if it's too large for the address space.
1186               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1187                 unsigned NumElts = DstTy.getNumElements();
1188                 unsigned EltSize = EltTy.getSizeInBits();
1189 
1190                 if (MaxSize % EltSize == 0) {
1191                   return std::make_pair(
1192                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1193                 }
1194 
1195                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1196 
1197                 // FIXME: Refine when odd breakdowns handled
1198                 // The scalars will need to be re-legalized.
1199                 if (NumPieces == 1 || NumPieces >= NumElts ||
1200                     NumElts % NumPieces != 0)
1201                   return std::make_pair(0, EltTy);
1202 
1203                 return std::make_pair(0,
1204                                       LLT::vector(NumElts / NumPieces, EltTy));
1205               }
1206 
1207               // FIXME: We could probably handle weird extending loads better.
1208               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1209               if (DstTy.getSizeInBits() > MemSize)
1210                 return std::make_pair(0, EltTy);
1211 
1212               unsigned EltSize = EltTy.getSizeInBits();
1213               unsigned DstSize = DstTy.getSizeInBits();
1214               if (!isPowerOf2_32(DstSize)) {
1215                 // We're probably decomposing an odd sized store. Try to split
1216                 // to the widest type. TODO: Account for alignment. As-is it
1217                 // should be OK, since the new parts will be further legalized.
1218                 unsigned FloorSize = PowerOf2Floor(DstSize);
1219                 return std::make_pair(
1220                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1221               }
1222 
1223               // Need to split because of alignment.
1224               unsigned Align = Query.MMODescrs[0].AlignInBits;
1225               if (EltSize > Align &&
1226                   (EltSize / Align < DstTy.getNumElements())) {
1227                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1228               }
1229 
1230               // May need relegalization for the scalars.
1231               return std::make_pair(0, EltTy);
1232             })
1233     .lowerIfMemSizeNotPow2()
1234     .minScalar(0, S32);
1235 
1236     if (IsStore)
1237       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1238 
1239     Actions
1240         .widenScalarToNextPow2(0)
1241         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1242         .lower();
1243   }
1244 
1245   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1246                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1247                                                   {S32, GlobalPtr, 16, 2 * 8},
1248                                                   {S32, LocalPtr, 8, 8},
1249                                                   {S32, LocalPtr, 16, 16},
1250                                                   {S32, PrivatePtr, 8, 8},
1251                                                   {S32, PrivatePtr, 16, 16},
1252                                                   {S32, ConstantPtr, 8, 8},
1253                                                   {S32, ConstantPtr, 16, 2 * 8}});
1254   if (ST.hasFlatAddressSpace()) {
1255     ExtLoads.legalForTypesWithMemDesc(
1256         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1257   }
1258 
1259   ExtLoads.clampScalar(0, S32, S32)
1260           .widenScalarToNextPow2(0)
1261           .unsupportedIfMemSizeNotPow2()
1262           .lower();
1263 
1264   auto &Atomics = getActionDefinitionsBuilder(
1265     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1266      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1267      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1268      G_ATOMICRMW_UMIN})
1269     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1270                {S64, GlobalPtr}, {S64, LocalPtr},
1271                {S32, RegionPtr}, {S64, RegionPtr}});
1272   if (ST.hasFlatAddressSpace()) {
1273     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1274   }
1275 
1276   if (ST.hasLDSFPAtomics()) {
1277     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1278       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1279   }
1280 
1281   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1282   // demarshalling
1283   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1284     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1285                 {S32, FlatPtr}, {S64, FlatPtr}})
1286     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1287                {S32, RegionPtr}, {S64, RegionPtr}});
1288   // TODO: Pointer types, any 32-bit or 64-bit vector
1289 
1290   // Condition should be s32 for scalar, s1 for vector.
1291   getActionDefinitionsBuilder(G_SELECT)
1292     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1293           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1294           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1295     .clampScalar(0, S16, S64)
1296     .scalarize(1)
1297     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1298     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1299     .clampMaxNumElements(0, S32, 2)
1300     .clampMaxNumElements(0, LocalPtr, 2)
1301     .clampMaxNumElements(0, PrivatePtr, 2)
1302     .scalarize(0)
1303     .widenScalarToNextPow2(0)
1304     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1305 
1306   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1307   // be more flexible with the shift amount type.
1308   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1309     .legalFor({{S32, S32}, {S64, S32}});
1310   if (ST.has16BitInsts()) {
1311     if (ST.hasVOP3PInsts()) {
1312       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1313             .clampMaxNumElements(0, S16, 2);
1314     } else
1315       Shifts.legalFor({{S16, S16}});
1316 
1317     // TODO: Support 16-bit shift amounts for all types
1318     Shifts.widenScalarIf(
1319       [=](const LegalityQuery &Query) {
1320         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1321         // 32-bit amount.
1322         const LLT ValTy = Query.Types[0];
1323         const LLT AmountTy = Query.Types[1];
1324         return ValTy.getSizeInBits() <= 16 &&
1325                AmountTy.getSizeInBits() < 16;
1326       }, changeTo(1, S16));
1327     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1328     Shifts.clampScalar(1, S32, S32);
1329     Shifts.clampScalar(0, S16, S64);
1330     Shifts.widenScalarToNextPow2(0, 16);
1331 
1332     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1333       .minScalar(0, S16)
1334       .scalarize(0)
1335       .lower();
1336   } else {
1337     // Make sure we legalize the shift amount type first, as the general
1338     // expansion for the shifted type will produce much worse code if it hasn't
1339     // been truncated already.
1340     Shifts.clampScalar(1, S32, S32);
1341     Shifts.clampScalar(0, S32, S64);
1342     Shifts.widenScalarToNextPow2(0, 32);
1343 
1344     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1345       .minScalar(0, S32)
1346       .scalarize(0)
1347       .lower();
1348   }
1349   Shifts.scalarize(0);
1350 
1351   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
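    // For G_EXTRACT_VECTOR_ELT the vector is type index 1 and the element is
    // type index 0; for G_INSERT_VECTOR_ELT the indices are swapped.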
1352     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1353     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1354     unsigned IdxTypeIdx = 2;
1355 
1356     getActionDefinitionsBuilder(Op)
1357       .customIf([=](const LegalityQuery &Query) {
1358           const LLT EltTy = Query.Types[EltTypeIdx];
1359           const LLT VecTy = Query.Types[VecTypeIdx];
1360           const LLT IdxTy = Query.Types[IdxTypeIdx];
1361           const unsigned EltSize = EltTy.getSizeInBits();
1362           return (EltSize == 32 || EltSize == 64) &&
1363                   VecTy.getSizeInBits() % 32 == 0 &&
1364                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1365                   IdxTy.getSizeInBits() == 32;
1366         })
1367       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1368                  bitcastToVectorElement32(VecTypeIdx))
1369       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1370       .bitcastIf(
1371         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1372         [=](const LegalityQuery &Query) {
1373           // For > 64-bit element types, try to turn this into a 64-bit
1374           // element vector since we may be able to do better indexing
1375           // if this is scalar. If not, fall back to 32.
1376           const LLT EltTy = Query.Types[EltTypeIdx];
1377           const LLT VecTy = Query.Types[VecTypeIdx];
1378           const unsigned DstEltSize = EltTy.getSizeInBits();
1379           const unsigned VecSize = VecTy.getSizeInBits();
1380 
1381           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1382           return std::make_pair(
1383             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1384         })
1385       .clampScalar(EltTypeIdx, S32, S64)
1386       .clampScalar(VecTypeIdx, S32, S64)
1387       .clampScalar(IdxTypeIdx, S32, S32)
1388       .clampMaxNumElements(VecTypeIdx, S32, 32)
1389       // TODO: Clamp elements for 64-bit vectors?
1390       // It should only be necessary with variable indexes.
1391       // As a last resort, lower to the stack
1392       .lower();
1393   }
1394 
1395   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1396     .unsupportedIf([=](const LegalityQuery &Query) {
1397         const LLT &EltTy = Query.Types[1].getElementType();
1398         return Query.Types[0] != EltTy;
1399       });
1400 
1401   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
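    // For G_EXTRACT the wide source is type index 1; for G_INSERT the wide
    // destination is type index 0.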
1402     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1403     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1404 
1405     // FIXME: Doesn't handle extract of illegal sizes.
1406     getActionDefinitionsBuilder(Op)
1407       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1408       // FIXME: Multiples of 16 should not be legal.
1409       .legalIf([=](const LegalityQuery &Query) {
1410           const LLT BigTy = Query.Types[BigTyIdx];
1411           const LLT LitTy = Query.Types[LitTyIdx];
1412           return (BigTy.getSizeInBits() % 32 == 0) &&
1413                  (LitTy.getSizeInBits() % 16 == 0);
1414         })
1415       .widenScalarIf(
1416         [=](const LegalityQuery &Query) {
1417           const LLT BigTy = Query.Types[BigTyIdx];
1418           return (BigTy.getScalarSizeInBits() < 16);
1419         },
1420         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1421       .widenScalarIf(
1422         [=](const LegalityQuery &Query) {
1423           const LLT LitTy = Query.Types[LitTyIdx];
1424           return (LitTy.getScalarSizeInBits() < 16);
1425         },
1426         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1427       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1428       .widenScalarToNextPow2(BigTyIdx, 32);
1429 
1430   }
1431 
1432   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1433     .legalForCartesianProduct(AllS32Vectors, {S32})
1434     .legalForCartesianProduct(AllS64Vectors, {S64})
1435     .clampNumElements(0, V16S32, V32S32)
1436     .clampNumElements(0, V2S64, V16S64)
1437     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1438 
1439   if (ST.hasScalarPackInsts()) {
1440     BuildVector
1441       // FIXME: Should probably widen s1 vectors straight to s32
1442       .minScalarOrElt(0, S16)
1443       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1444       .minScalar(1, S32);
1445 
1446     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1447       .legalFor({V2S16, S32})
1448       .lower();
1449     BuildVector.minScalarOrElt(0, S32);
1450   } else {
1451     BuildVector.customFor({V2S16, S16});
1452     BuildVector.minScalarOrElt(0, S32);
1453 
1454     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1455       .customFor({V2S16, S32})
1456       .lower();
1457   }
1458 
1459   BuildVector.legalIf(isRegisterType(0));
1460 
1461   // FIXME: Clamp maximum size
1462   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1463     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1464     .clampMaxNumElements(0, S32, 32)
1465     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1466     .clampMaxNumElements(0, S16, 64);
1467 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1470   if (ST.hasVOP3PInsts()) {
1471     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1472       .customFor({V2S16, V2S16})
1473       .lower();
1474   } else
1475     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1476 
1477   // Merge/Unmerge
1478   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1479     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1480     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1481 
1482     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1483       const LLT Ty = Query.Types[TypeIdx];
1484       if (Ty.isVector()) {
1485         const LLT &EltTy = Ty.getElementType();
1486         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1487           return true;
1488         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1489           return true;
1490       }
1491       return false;
1492     };
1493 
1494     auto &Builder = getActionDefinitionsBuilder(Op)
1495       .lowerFor({{S16, V2S16}})
1496       .lowerIf([=](const LegalityQuery &Query) {
1497           const LLT BigTy = Query.Types[BigTyIdx];
1498           return BigTy.getSizeInBits() == 32;
1499         })
1500       // Try to widen to s16 first for small types.
1501       // TODO: Only do this on targets with legal s16 shifts
1502       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1503       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1504       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1505       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1506                            elementTypeIs(1, S16)),
1507                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1511       .clampScalar(LitTyIdx, S32, S512)
1512       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1513       // Break up vectors with weird elements into scalars
1514       .fewerElementsIf(
1515         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1516         scalarize(0))
1517       .fewerElementsIf(
1518         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1519         scalarize(1))
1520       .clampScalar(BigTyIdx, S32, MaxScalar);
1521 
1522     if (Op == G_MERGE_VALUES) {
1523       Builder.widenScalarIf(
1524         // TODO: Use 16-bit shifts if legal for 8-bit values?
1525         [=](const LegalityQuery &Query) {
1526           const LLT Ty = Query.Types[LitTyIdx];
1527           return Ty.getSizeInBits() < 32;
1528         },
1529         changeTo(LitTyIdx, S32));
1530     }
1531 
1532     Builder.widenScalarIf(
1533       [=](const LegalityQuery &Query) {
1534         const LLT Ty = Query.Types[BigTyIdx];
1535         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1536           Ty.getSizeInBits() % 16 != 0;
1537       },
1538       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
1541         const LLT &Ty = Query.Types[BigTyIdx];
1542         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1543         if (NewSizeInBits >= 256) {
1544           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1545           if (RoundedTo < NewSizeInBits)
1546             NewSizeInBits = RoundedTo;
1547         }
1548         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1549       })
1550       .legalIf([=](const LegalityQuery &Query) {
1551           const LLT &BigTy = Query.Types[BigTyIdx];
1552           const LLT &LitTy = Query.Types[LitTyIdx];
1553 
1554           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1555             return false;
1556           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1557             return false;
1558 
1559           return BigTy.getSizeInBits() % 16 == 0 &&
1560                  LitTy.getSizeInBits() % 16 == 0 &&
1561                  BigTy.getSizeInBits() <= MaxRegisterSize;
1562         })
1563       // Any vectors left are the wrong size. Scalarize them.
1564       .scalarize(0)
1565       .scalarize(1);
1566   }
1567 
1568   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1569   // RegBankSelect.
1570   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1571     .legalFor({{S32}, {S64}});
1572 
1573   if (ST.hasVOP3PInsts()) {
1574     SextInReg.lowerFor({{V2S16}})
1575       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1576       // get more vector shift opportunities, since we'll get those when
1577       // expanded.
1578       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1579   } else if (ST.has16BitInsts()) {
1580     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1581   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1584     SextInReg.lowerFor({{S32}, {S64}});
1585   }
1586 
1587   SextInReg
1588     .scalarize(0)
1589     .clampScalar(0, S32, S64)
1590     .lower();
1591 
1592   getActionDefinitionsBuilder(G_FSHR)
1593     .legalFor({{S32, S32}})
1594     .scalarize(0)
1595     .lower();
1596 
1597   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1598     .legalFor({S64});
1599 
1600   getActionDefinitionsBuilder(G_FENCE)
1601     .alwaysLegal();
1602 
1603   getActionDefinitionsBuilder({
1604       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1605       G_FCOPYSIGN,
1606 
1607       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1608       G_ATOMICRMW_NAND,
1609       G_ATOMICRMW_FSUB,
1610       G_READ_REGISTER,
1611       G_WRITE_REGISTER,
1612 
1613       G_SADDO, G_SSUBO,
1614 
      // TODO: Implement
1616       G_FMINIMUM, G_FMAXIMUM,
1617       G_FSHL
1618     }).lower();
1619 
1620   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1621         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1622         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1623     .unsupported();
1624 
1625   computeTables();
1626   verify(*ST.getInstrInfo());
1627 }
1628 
1629 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1630                                          MachineInstr &MI) const {
1631   MachineIRBuilder &B = Helper.MIRBuilder;
1632   MachineRegisterInfo &MRI = *B.getMRI();
1633 
1634   switch (MI.getOpcode()) {
1635   case TargetOpcode::G_ADDRSPACE_CAST:
1636     return legalizeAddrSpaceCast(MI, MRI, B);
1637   case TargetOpcode::G_FRINT:
1638     return legalizeFrint(MI, MRI, B);
1639   case TargetOpcode::G_FCEIL:
1640     return legalizeFceil(MI, MRI, B);
1641   case TargetOpcode::G_FREM:
1642     return legalizeFrem(MI, MRI, B);
1643   case TargetOpcode::G_INTRINSIC_TRUNC:
1644     return legalizeIntrinsicTrunc(MI, MRI, B);
1645   case TargetOpcode::G_SITOFP:
1646     return legalizeITOFP(MI, MRI, B, true);
1647   case TargetOpcode::G_UITOFP:
1648     return legalizeITOFP(MI, MRI, B, false);
1649   case TargetOpcode::G_FPTOSI:
1650     return legalizeFPTOI(MI, MRI, B, true);
1651   case TargetOpcode::G_FPTOUI:
1652     return legalizeFPTOI(MI, MRI, B, false);
1653   case TargetOpcode::G_FMINNUM:
1654   case TargetOpcode::G_FMAXNUM:
1655   case TargetOpcode::G_FMINNUM_IEEE:
1656   case TargetOpcode::G_FMAXNUM_IEEE:
1657     return legalizeMinNumMaxNum(Helper, MI);
1658   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1659     return legalizeExtractVectorElt(MI, MRI, B);
1660   case TargetOpcode::G_INSERT_VECTOR_ELT:
1661     return legalizeInsertVectorElt(MI, MRI, B);
1662   case TargetOpcode::G_SHUFFLE_VECTOR:
1663     return legalizeShuffleVector(MI, MRI, B);
1664   case TargetOpcode::G_FSIN:
1665   case TargetOpcode::G_FCOS:
1666     return legalizeSinCos(MI, MRI, B);
1667   case TargetOpcode::G_GLOBAL_VALUE:
1668     return legalizeGlobalValue(MI, MRI, B);
1669   case TargetOpcode::G_LOAD:
1670     return legalizeLoad(Helper, MI);
1671   case TargetOpcode::G_FMAD:
1672     return legalizeFMad(MI, MRI, B);
1673   case TargetOpcode::G_FDIV:
1674     return legalizeFDIV(MI, MRI, B);
1675   case TargetOpcode::G_UDIV:
1676   case TargetOpcode::G_UREM:
1677     return legalizeUDIV_UREM(MI, MRI, B);
1678   case TargetOpcode::G_SDIV:
1679   case TargetOpcode::G_SREM:
1680     return legalizeSDIV_SREM(MI, MRI, B);
1681   case TargetOpcode::G_ATOMIC_CMPXCHG:
1682     return legalizeAtomicCmpXChg(MI, MRI, B);
1683   case TargetOpcode::G_FLOG:
1684     return legalizeFlog(MI, B, numbers::ln2f);
1685   case TargetOpcode::G_FLOG10:
1686     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1687   case TargetOpcode::G_FEXP:
1688     return legalizeFExp(MI, B);
1689   case TargetOpcode::G_FPOW:
1690     return legalizeFPow(MI, B);
1691   case TargetOpcode::G_FFLOOR:
1692     return legalizeFFloor(MI, MRI, B);
1693   case TargetOpcode::G_BUILD_VECTOR:
1694     return legalizeBuildVector(MI, MRI, B);
1695   default:
1696     return false;
1697   }
1698 
1699   llvm_unreachable("expected switch to return");
1700 }
1701 
1702 Register AMDGPULegalizerInfo::getSegmentAperture(
1703   unsigned AS,
1704   MachineRegisterInfo &MRI,
1705   MachineIRBuilder &B) const {
1706   MachineFunction &MF = B.getMF();
1707   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1708   const LLT S32 = LLT::scalar(32);
1709 
1710   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1711 
1712   if (ST.hasApertureRegs()) {
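    // The aperture is the high 32 bits of the segment's 64-bit flat address
    // and is 64K-aligned, so the hardware register only holds its top 16
    // bits. s_getreg reads that field and the shift by WidthM1 + 1 (== 16)
    // below moves it back into position.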
1713     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1714     // getreg.
1715     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1716         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1717         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1718     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1719         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1720         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1721     unsigned Encoding =
1722         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1723         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1724         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1725 
1726     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1727 
1728     B.buildInstr(AMDGPU::S_GETREG_B32)
1729       .addDef(GetReg)
1730       .addImm(Encoding);
1731     MRI.setType(GetReg, S32);
1732 
1733     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1734     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1735   }
1736 
1737   Register QueuePtr = MRI.createGenericVirtualRegister(
1738     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1739 
1740   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1741     return Register();
1742 
1743   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1744   // private_segment_aperture_base_hi.
1745   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1746 
1747   // TODO: can we be smarter about machine pointer info?
1748   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1749   MachineMemOperand *MMO = MF.getMachineMemOperand(
1750       PtrInfo,
1751       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1752           MachineMemOperand::MOInvariant,
1753       4, commonAlignment(Align(64), StructOffset));
1754 
1755   Register LoadAddr;
1756 
1757   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1758   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1759 }
1760 
1761 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1762   MachineInstr &MI, MachineRegisterInfo &MRI,
1763   MachineIRBuilder &B) const {
1764   MachineFunction &MF = B.getMF();
1765 
1766   const LLT S32 = LLT::scalar(32);
1767   Register Dst = MI.getOperand(0).getReg();
1768   Register Src = MI.getOperand(1).getReg();
1769 
1770   LLT DstTy = MRI.getType(Dst);
1771   LLT SrcTy = MRI.getType(Src);
1772   unsigned DestAS = DstTy.getAddressSpace();
1773   unsigned SrcAS = SrcTy.getAddressSpace();
1774 
1775   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1776   // vector element.
1777   assert(!DstTy.isVector());
1778 
1779   const AMDGPUTargetMachine &TM
1780     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1781 
1782   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1783     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1784     return true;
1785   }
1786 
1787   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1788     // Truncate.
1789     B.buildExtract(Dst, Src, 0);
1790     MI.eraseFromParent();
1791     return true;
1792   }
1793 
1794   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1795     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1796     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1797 
    // FIXME: This is a bit ugly due to merging 2 pointers into another
    // pointer. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
1801     auto HighAddr = B.buildConstant(
1802       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1803     B.buildMerge(Dst, {Src, HighAddr});
1804     MI.eraseFromParent();
1805     return true;
1806   }
1807 
1808   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1809     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1810            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1811     unsigned NullVal = TM.getNullPointerValue(DestAS);
1812 
1813     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1814     auto FlatNull = B.buildConstant(SrcTy, 0);
1815 
1816     // Extract low 32-bits of the pointer.
1817     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1818 
1819     auto CmpRes =
1820         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1821     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1822 
1823     MI.eraseFromParent();
1824     return true;
1825   }
1826 
1827   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1828     return false;
1829 
1830   if (!ST.hasFlatAddressSpace())
1831     return false;
1832 
1833   auto SegmentNull =
1834       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1835   auto FlatNull =
1836       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
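  // In effect this builds:
  //   flat = (src == segment_null) ? flat_null
  //                                : ((u64)aperture << 32) | (u32)src
  // where 'aperture' is the high half of the segment's flat address range.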
1837 
1838   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1839   if (!ApertureReg.isValid())
1840     return false;
1841 
1842   auto CmpRes =
1843       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1844 
1845   // Coerce the type of the low half of the result so we can use merge_values.
1846   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1847 
1848   // TODO: Should we allow mismatched types but matching sizes in merges to
1849   // avoid the ptrtoint?
1850   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1851   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1852 
1853   MI.eraseFromParent();
1854   return true;
1855 }
1856 
1857 bool AMDGPULegalizerInfo::legalizeFrint(
1858   MachineInstr &MI, MachineRegisterInfo &MRI,
1859   MachineIRBuilder &B) const {
1860   Register Src = MI.getOperand(1).getReg();
1861   LLT Ty = MRI.getType(Src);
1862   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1863 
1864   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1865   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
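  // C1 is 2^52, the smallest magnitude at which the double format has no
  // fraction bits, and C2 is 2^52 - 0.5. Adding copysign(C1, x) and
  // subtracting it again rounds x to an integer using the current rounding
  // mode (round-to-nearest-even by default); e.g. 2.7 + 2^52 rounds to
  // 2^52 + 3, giving 3.0 after the subtraction. Values with |x| > C2 are
  // already integers, so the compare/select below returns them unchanged.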
1866 
1867   auto C1 = B.buildFConstant(Ty, C1Val);
1868   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1869 
1870   // TODO: Should this propagate fast-math-flags?
1871   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1872   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1873 
1874   auto C2 = B.buildFConstant(Ty, C2Val);
1875   auto Fabs = B.buildFAbs(Ty, Src);
1876 
1877   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1878   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1879   MI.eraseFromParent();
1880   return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizeFceil(
1884   MachineInstr &MI, MachineRegisterInfo &MRI,
1885   MachineIRBuilder &B) const {
1886 
1887   const LLT S1 = LLT::scalar(1);
1888   const LLT S64 = LLT::scalar(64);
1889 
1890   Register Src = MI.getOperand(1).getReg();
1891   assert(MRI.getType(Src) == S64);
1892 
1893   // result = trunc(src)
1894   // if (src > 0.0 && src != result)
1895   //   result += 1.0
1896 
1897   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1898 
1899   const auto Zero = B.buildFConstant(S64, 0.0);
1900   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1904   auto Add = B.buildSelect(S64, And, One, Zero);
1905 
1906   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeFrem(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
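  // frem(x, y) == x - trunc(x / y) * y, emitted here as
  // fma(-trunc(x / y), y, x).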
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1926 }
1927 
1928 static MachineInstrBuilder extractF64Exponent(Register Hi,
1929                                               MachineIRBuilder &B) {
1930   const unsigned FractBits = 52;
1931   const unsigned ExpBits = 11;
1932   LLT S32 = LLT::scalar(32);
1933 
1934   auto Const0 = B.buildConstant(S32, FractBits - 32);
1935   auto Const1 = B.buildConstant(S32, ExpBits);
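  // The exponent of an IEEE double sits in bits [62:52], i.e. an 11-bit field
  // at bit 20 (FractBits - 32) of the high dword; extract it with ubfe and
  // subtract the bias (1023) to get the unbiased exponent.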
1936 
1937   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1938     .addUse(Hi)
1939     .addUse(Const0.getReg(0))
1940     .addUse(Const1.getReg(0));
1941 
1942   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1946   MachineInstr &MI, MachineRegisterInfo &MRI,
1947   MachineIRBuilder &B) const {
1948   const LLT S1 = LLT::scalar(1);
1949   const LLT S32 = LLT::scalar(32);
1950   const LLT S64 = LLT::scalar(64);
1951 
1952   Register Src = MI.getOperand(1).getReg();
1953   assert(MRI.getType(Src) == S64);
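  // In outline, truncation toward zero is done by clearing the fraction bits
  // that lie below the binary point for the value's exponent:
  //   exp < 0   -> keep only the sign bit (|src| < 1, result is +/-0)
  //   exp > 51  -> src is already an integer
  //   otherwise -> src & ~(fract_mask >> exp)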
1954 
1955   // TODO: Should this use extract since the low half is unused?
1956   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1957   Register Hi = Unmerge.getReg(1);
1958 
1959   // Extract the upper half, since this is where we will find the sign and
1960   // exponent.
1961   auto Exp = extractF64Exponent(Hi, B);
1962 
1963   const unsigned FractBits = 52;
1964 
1965   // Extract the sign bit.
1966   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1967   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1968 
1969   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1970 
1971   const auto Zero32 = B.buildConstant(S32, 0);
1972 
1973   // Extend back to 64-bits.
1974   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1975 
1976   auto Shr = B.buildAShr(S64, FractMask, Exp);
1977   auto Not = B.buildNot(S64, Shr);
1978   auto Tmp0 = B.buildAnd(S64, Src, Not);
1979   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1980 
1981   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1982   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1983 
1984   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1985   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1986   MI.eraseFromParent();
1987   return true;
1988 }
1989 
1990 bool AMDGPULegalizerInfo::legalizeITOFP(
1991   MachineInstr &MI, MachineRegisterInfo &MRI,
1992   MachineIRBuilder &B, bool Signed) const {
1993 
1994   Register Dst = MI.getOperand(0).getReg();
1995   Register Src = MI.getOperand(1).getReg();
1996 
1997   const LLT S64 = LLT::scalar(64);
1998   const LLT S32 = LLT::scalar(32);
1999 
2000   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
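  // The conversion is done piecewise:
  //   result = ldexp((double)hi, 32) + (double)(uint32_t)lo
  // Only the high half carries the sign in the signed case; the low half is
  // always converted as unsigned.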
2001 
2002   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2003 
2004   auto CvtHi = Signed ?
2005     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2006     B.buildUITOFP(S64, Unmerge.getReg(1));
2007 
2008   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2009 
2010   auto ThirtyTwo = B.buildConstant(S32, 32);
2011   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2012     .addUse(CvtHi.getReg(0))
2013     .addUse(ThirtyTwo.getReg(0));
2014 
2015   // TODO: Should this propagate fast-math-flags?
2016   B.buildFAdd(Dst, LdExp, CvtLo);
2017   MI.eraseFromParent();
2018   return true;
2019 }
2020 
2021 // TODO: Copied from DAG implementation. Verify logic and document how this
2022 // actually works.
2023 bool AMDGPULegalizerInfo::legalizeFPTOI(
2024   MachineInstr &MI, MachineRegisterInfo &MRI,
2025   MachineIRBuilder &B, bool Signed) const {
2026 
2027   Register Dst = MI.getOperand(0).getReg();
2028   Register Src = MI.getOperand(1).getReg();
2029 
2030   const LLT S64 = LLT::scalar(64);
2031   const LLT S32 = LLT::scalar(32);
2032 
2033   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2034 
2035   unsigned Flags = MI.getFlags();
2036 
2037   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2038   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2039   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
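  // K0 == 2^-32 and K1 == -(2^32). This splits trunc(src) into
  //   hi = floor(trunc(src) * 2^-32)
  //   lo = fma(hi, -(2^32), trunc(src)) == trunc(src) - hi * 2^32
  // i.e. the high and low 32-bit halves of the integer result.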
2040 
2041   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2042   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2043   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2044 
2045   auto Hi = Signed ?
2046     B.buildFPTOSI(S32, FloorMul) :
2047     B.buildFPTOUI(S32, FloorMul);
2048   auto Lo = B.buildFPTOUI(S32, Fma);
2049 
2050   B.buildMerge(Dst, { Lo, Hi });
2051   MI.eraseFromParent();
2052 
2053   return true;
2054 }
2055 
2056 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2057                                                MachineInstr &MI) const {
2058   MachineFunction &MF = Helper.MIRBuilder.getMF();
2059   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2060 
2061   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2062                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2063 
  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
2066   if (!MFI->getMode().IEEE)
2067     return !IsIEEEOp;
2068 
2069   if (IsIEEEOp)
2070     return true;
2071 
2072   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2073 }
2074 
2075 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2076   MachineInstr &MI, MachineRegisterInfo &MRI,
2077   MachineIRBuilder &B) const {
2078   // TODO: Should move some of this into LegalizerHelper.
2079 
2080   // TODO: Promote dynamic indexing of s16 to s32
2081 
2082   // FIXME: Artifact combiner probably should have replaced the truncated
2083   // constant before this, so we shouldn't need
2084   // getConstantVRegValWithLookThrough.
2085   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2086     MI.getOperand(2).getReg(), MRI);
2087   if (!IdxVal) // Dynamic case will be selected to register indexing.
2088     return true;
2089 
2090   Register Dst = MI.getOperand(0).getReg();
2091   Register Vec = MI.getOperand(1).getReg();
2092 
2093   LLT VecTy = MRI.getType(Vec);
2094   LLT EltTy = VecTy.getElementType();
2095   assert(EltTy == MRI.getType(Dst));
2096 
2097   if (IdxVal->Value < VecTy.getNumElements())
2098     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2099   else
2100     B.buildUndef(Dst);
2101 
2102   MI.eraseFromParent();
2103   return true;
2104 }
2105 
2106 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2107   MachineInstr &MI, MachineRegisterInfo &MRI,
2108   MachineIRBuilder &B) const {
2109   // TODO: Should move some of this into LegalizerHelper.
2110 
2111   // TODO: Promote dynamic indexing of s16 to s32
2112 
2113   // FIXME: Artifact combiner probably should have replaced the truncated
2114   // constant before this, so we shouldn't need
2115   // getConstantVRegValWithLookThrough.
2116   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2117     MI.getOperand(3).getReg(), MRI);
2118   if (!IdxVal) // Dynamic case will be selected to register indexing.
2119     return true;
2120 
2121   Register Dst = MI.getOperand(0).getReg();
2122   Register Vec = MI.getOperand(1).getReg();
2123   Register Ins = MI.getOperand(2).getReg();
2124 
2125   LLT VecTy = MRI.getType(Vec);
2126   LLT EltTy = VecTy.getElementType();
2127   assert(EltTy == MRI.getType(Ins));
2128 
2129   if (IdxVal->Value < VecTy.getNumElements())
2130     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2131   else
2132     B.buildUndef(Dst);
2133 
2134   MI.eraseFromParent();
2135   return true;
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2139   MachineInstr &MI, MachineRegisterInfo &MRI,
2140   MachineIRBuilder &B) const {
2141   const LLT V2S16 = LLT::vector(2, 16);
2142 
2143   Register Dst = MI.getOperand(0).getReg();
2144   Register Src0 = MI.getOperand(1).getReg();
2145   LLT DstTy = MRI.getType(Dst);
2146   LLT SrcTy = MRI.getType(Src0);
2147 
2148   if (SrcTy == V2S16 && DstTy == V2S16 &&
2149       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2150     return true;
2151 
2152   MachineIRBuilder HelperBuilder(MI);
2153   GISelObserverWrapper DummyObserver;
2154   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2155   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::legalizeSinCos(
2159   MachineInstr &MI, MachineRegisterInfo &MRI,
2160   MachineIRBuilder &B) const {
2161 
2162   Register DstReg = MI.getOperand(0).getReg();
2163   Register SrcReg = MI.getOperand(1).getReg();
2164   LLT Ty = MRI.getType(DstReg);
2165   unsigned Flags = MI.getFlags();
2166 
2167   Register TrigVal;
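  // The hardware sin/cos expect their operand scaled by 1/(2*pi), i.e. in
  // units of full turns, so multiply by 0.5 * inv_pi first. On subtargets
  // where the instruction only accepts a reduced input range, additionally
  // take the fractional part to bring the operand into [0, 1).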
2168   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2169   if (ST.hasTrigReducedRange()) {
2170     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2171     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2172       .addUse(MulVal.getReg(0))
2173       .setMIFlags(Flags).getReg(0);
2174   } else
2175     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2176 
2177   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2178     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2179   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2180     .addUse(TrigVal)
2181     .setMIFlags(Flags);
2182   MI.eraseFromParent();
2183   return true;
2184 }
2185 
2186 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2187                                                   MachineIRBuilder &B,
2188                                                   const GlobalValue *GV,
2189                                                   int64_t Offset,
2190                                                   unsigned GAFlags) const {
2191   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2192   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2193   // to the following code sequence:
2194   //
2195   // For constant address space:
2196   //   s_getpc_b64 s[0:1]
2197   //   s_add_u32 s0, s0, $symbol
2198   //   s_addc_u32 s1, s1, 0
2199   //
2200   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2201   //   a fixup or relocation is emitted to replace $symbol with a literal
2202   //   constant, which is a pc-relative offset from the encoding of the $symbol
2203   //   operand to the global variable.
2204   //
2205   // For global address space:
2206   //   s_getpc_b64 s[0:1]
2207   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2208   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2209   //
2210   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2211   //   fixups or relocations are emitted to replace $symbol@*@lo and
2212   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2213   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2214   //   operand to the global variable.
2215   //
2216   // What we want here is an offset from the value returned by s_getpc
2217   // (which is the address of the s_add_u32 instruction) to the global
2218   // variable, but since the encoding of $symbol starts 4 bytes after the start
2219   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2220   // small. This requires us to add 4 to the global variable offset in order to
2221   // compute the correct address.
2222 
2223   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2224 
2225   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2226     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2227 
2228   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2229     .addDef(PCReg);
2230 
2231   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2232   if (GAFlags == SIInstrInfo::MO_NONE)
2233     MIB.addImm(0);
2234   else
2235     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2236 
2237   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2238 
2239   if (PtrTy.getSizeInBits() == 32)
2240     B.buildExtract(DstReg, PCReg, 0);
2241   return true;
2242  }
2243 
2244 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2245   MachineInstr &MI, MachineRegisterInfo &MRI,
2246   MachineIRBuilder &B) const {
2247   Register DstReg = MI.getOperand(0).getReg();
2248   LLT Ty = MRI.getType(DstReg);
2249   unsigned AS = Ty.getAddressSpace();
2250 
2251   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2252   MachineFunction &MF = B.getMF();
2253   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2254 
2255   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2256     if (!MFI->isEntryFunction()) {
2257       const Function &Fn = MF.getFunction();
2258       DiagnosticInfoUnsupported BadLDSDecl(
2259         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2260         DS_Warning);
2261       Fn.getContext().diagnose(BadLDSDecl);
2262 
2263       // We currently don't have a way to correctly allocate LDS objects that
2264       // aren't directly associated with a kernel. We do force inlining of
2265       // functions that use local objects. However, if these dead functions are
2266       // not eliminated, we don't want a compile time error. Just emit a warning
2267       // and a trap, since there should be no callable path here.
2268       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2269       B.buildUndef(DstReg);
2270       MI.eraseFromParent();
2271       return true;
2272     }
2273 
2274     // TODO: We could emit code to handle the initialization somewhere.
2275     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2276       const SITargetLowering *TLI = ST.getTargetLowering();
2277       if (!TLI->shouldUseLDSConstAddress(GV)) {
2278         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2279         return true; // Leave in place;
2280       }
2281 
2282       B.buildConstant(
2283           DstReg,
2284           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2285       MI.eraseFromParent();
2286       return true;
2287     }
2288 
2289     const Function &Fn = MF.getFunction();
2290     DiagnosticInfoUnsupported BadInit(
2291       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2292     Fn.getContext().diagnose(BadInit);
2293     return true;
2294   }
2295 
2296   const SITargetLowering *TLI = ST.getTargetLowering();
2297 
2298   if (TLI->shouldEmitFixup(GV)) {
2299     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2300     MI.eraseFromParent();
2301     return true;
2302   }
2303 
2304   if (TLI->shouldEmitPCReloc(GV)) {
2305     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2306     MI.eraseFromParent();
2307     return true;
2308   }
2309 
2310   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2311   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
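  // Build a pc-relative address of the global's GOT entry and load the real
  // pointer from it. For a 32-bit address space result, only the low half of
  // the loaded pointer is kept.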
2312 
2313   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2314       MachinePointerInfo::getGOT(MF),
2315       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2316           MachineMemOperand::MOInvariant,
2317       8 /*Size*/, Align(8));
2318 
2319   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2320 
2321   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2323     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2324     B.buildExtract(DstReg, Load, 0);
2325   } else
2326     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2327 
2328   MI.eraseFromParent();
2329   return true;
2330 }
2331 
2332 static LLT widenToNextPowerOf2(LLT Ty) {
2333   if (Ty.isVector())
2334     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2335   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2336 }
2337 
2338 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2339                                        MachineInstr &MI) const {
2340   MachineIRBuilder &B = Helper.MIRBuilder;
2341   MachineRegisterInfo &MRI = *B.getMRI();
2342   GISelChangeObserver &Observer = Helper.Observer;
2343 
2344   Register PtrReg = MI.getOperand(1).getReg();
2345   LLT PtrTy = MRI.getType(PtrReg);
2346   unsigned AddrSpace = PtrTy.getAddressSpace();
2347 
2348   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2349     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2350     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2351     Observer.changingInstr(MI);
2352     MI.getOperand(1).setReg(Cast.getReg(0));
2353     Observer.changedInstr(MI);
2354     return true;
2355   }
2356 
2357   Register ValReg = MI.getOperand(0).getReg();
2358   LLT ValTy = MRI.getType(ValReg);
2359 
2360   MachineMemOperand *MMO = *MI.memoperands_begin();
2361   const unsigned ValSize = ValTy.getSizeInBits();
2362   const unsigned MemSize = 8 * MMO->getSize();
2363   const Align MemAlign = MMO->getAlign();
2364   const unsigned AlignInBits = 8 * MemAlign.value();
2365 
2366   // Widen non-power-of-2 loads to the alignment if needed
2367   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2368     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
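    // e.g. a sufficiently aligned 96-bit (<3 x s32>) load may be widened to
    // a 128-bit load here; the extra bits are dropped again below.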
2369 
2370     // This was already the correct extending load result type, so just adjust
2371     // the memory type.
2372     if (WideMemSize == ValSize) {
2373       MachineFunction &MF = B.getMF();
2374 
2375       MachineMemOperand *WideMMO =
2376           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2377       Observer.changingInstr(MI);
2378       MI.setMemRefs(MF, {WideMMO});
2379       Observer.changedInstr(MI);
2380       return true;
2381     }
2382 
2383     // Don't bother handling edge case that should probably never be produced.
2384     if (ValSize > WideMemSize)
2385       return false;
2386 
2387     LLT WideTy = widenToNextPowerOf2(ValTy);
2388 
2389     Register WideLoad;
2390     if (!WideTy.isVector()) {
2391       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2392       B.buildTrunc(ValReg, WideLoad).getReg(0);
2393     } else {
2394       // Extract the subvector.
2395 
2396       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2398         // (e.g. <3 x s32> -> <4 x s32>)
2399         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2400         B.buildExtract(ValReg, WideLoad, 0);
2401       } else {
2402         // For cases where the widened type isn't a nice register value, unmerge
2403         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2404         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2405         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2406         B.setInsertPt(B.getMBB(), MI.getIterator());
2407         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2408       }
2409     }
2410 
2411     MI.eraseFromParent();
2412     return true;
2413   }
2414 
2415   return false;
2416 }
2417 
2418 bool AMDGPULegalizerInfo::legalizeFMad(
2419   MachineInstr &MI, MachineRegisterInfo &MRI,
2420   MachineIRBuilder &B) const {
2421   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2422   assert(Ty.isScalar());
2423 
2424   MachineFunction &MF = B.getMF();
2425   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2426 
2427   // TODO: Always legal with future ftz flag.
2428   // FIXME: Do we need just output?
2429   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2430     return true;
2431   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2432     return true;
2433 
2434   MachineIRBuilder HelperBuilder(MI);
2435   GISelObserverWrapper DummyObserver;
2436   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2437   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2438 }
2439 
2440 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2441   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2442   Register DstReg = MI.getOperand(0).getReg();
2443   Register PtrReg = MI.getOperand(1).getReg();
2444   Register CmpVal = MI.getOperand(2).getReg();
2445   Register NewVal = MI.getOperand(3).getReg();
2446 
2447   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2448          "this should not have been custom lowered");
2449 
2450   LLT ValTy = MRI.getType(CmpVal);
2451   LLT VecTy = LLT::vector(2, ValTy);
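  // G_AMDGPU_ATOMIC_CMPXCHG expects the new value and the compare value
  // packed into a two-element vector (new value in element 0, compare value
  // in element 1), mirroring the data operand layout of the hardware cmpswap
  // instructions.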
2452 
2453   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2454 
2455   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2456     .addDef(DstReg)
2457     .addUse(PtrReg)
2458     .addUse(PackedVal)
2459     .setMemRefs(MI.memoperands());
2460 
2461   MI.eraseFromParent();
2462   return true;
2463 }
2464 
2465 bool AMDGPULegalizerInfo::legalizeFlog(
2466   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
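  // log_b(x) == log2(x) * log_b(2). Callers pass Log2BaseInverted as
  // log_b(2): ln(2) for G_FLOG and ln(2)/ln(10) == log10(2) for G_FLOG10.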
2467   Register Dst = MI.getOperand(0).getReg();
2468   Register Src = MI.getOperand(1).getReg();
2469   LLT Ty = B.getMRI()->getType(Dst);
2470   unsigned Flags = MI.getFlags();
2471 
2472   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2473   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2474 
2475   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2476   MI.eraseFromParent();
2477   return true;
2478 }
2479 
2480 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2481                                        MachineIRBuilder &B) const {
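  // exp(x) == exp2(x * log2(e)), so scale the input by log2(e) and emit
  // G_FEXP2.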
2482   Register Dst = MI.getOperand(0).getReg();
2483   Register Src = MI.getOperand(1).getReg();
2484   unsigned Flags = MI.getFlags();
2485   LLT Ty = B.getMRI()->getType(Dst);
2486 
2487   auto K = B.buildFConstant(Ty, numbers::log2e);
2488   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2489   B.buildFExp2(Dst, Mul, Flags);
2490   MI.eraseFromParent();
2491   return true;
2492 }
2493 
2494 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2495                                        MachineIRBuilder &B) const {
2496   Register Dst = MI.getOperand(0).getReg();
2497   Register Src0 = MI.getOperand(1).getReg();
2498   Register Src1 = MI.getOperand(2).getReg();
2499   unsigned Flags = MI.getFlags();
2500   LLT Ty = B.getMRI()->getType(Dst);
2501   const LLT S16 = LLT::scalar(16);
2502   const LLT S32 = LLT::scalar(32);
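  // pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply
  // (0 * anything == 0, even for inf/nan) is used so that corner cases such
  // as pow(x, 0) and pow(1, y) still fold to 1.0 after the exp2.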
2503 
2504   if (Ty == S32) {
2505     auto Log = B.buildFLog2(S32, Src0, Flags);
2506     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2507       .addUse(Log.getReg(0))
2508       .addUse(Src1)
2509       .setMIFlags(Flags);
2510     B.buildFExp2(Dst, Mul, Flags);
2511   } else if (Ty == S16) {
2512     // There's no f16 fmul_legacy, so we need to convert for it.
2513     auto Log = B.buildFLog2(S16, Src0, Flags);
2514     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2515     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2516     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2517       .addUse(Ext0.getReg(0))
2518       .addUse(Ext1.getReg(0))
2519       .setMIFlags(Flags);
2520 
2521     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2522   } else
2523     return false;
2524 
2525   MI.eraseFromParent();
2526   return true;
2527 }
2528 
2529 // Find a source register, ignoring any possible source modifiers.
2530 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2531   Register ModSrc = OrigSrc;
2532   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2533     ModSrc = SrcFNeg->getOperand(1).getReg();
2534     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2535       ModSrc = SrcFAbs->getOperand(1).getReg();
2536   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2537     ModSrc = SrcFAbs->getOperand(1).getReg();
2538   return ModSrc;
2539 }
2540 
2541 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2542                                          MachineRegisterInfo &MRI,
2543                                          MachineIRBuilder &B) const {
2544 
2545   const LLT S1 = LLT::scalar(1);
2546   const LLT S64 = LLT::scalar(64);
2547   Register Dst = MI.getOperand(0).getReg();
2548   Register OrigSrc = MI.getOperand(1).getReg();
2549   unsigned Flags = MI.getFlags();
2550   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2551          "this should not have been custom lowered");
2552 
2553   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2554   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2555   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2556   // V_FRACT bug is:
2557   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2558   //
2559   // Convert floor(x) to (x - fract(x))
2560 
2561   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2562     .addUse(OrigSrc)
2563     .setMIFlags(Flags);
2564 
2565   // Give source modifier matching some assistance before obscuring a foldable
2566   // pattern.
2567 
2568   // TODO: We can avoid the neg on the fract? The input sign to fract
2569   // shouldn't matter?
2570   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2571 
2572   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2573 
2574   Register Min = MRI.createGenericVirtualRegister(S64);
2575 
2576   // We don't need to concern ourselves with the snan handling difference, so
2577   // use the one which will directly select.
2578   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2579   if (MFI->getMode().IEEE)
2580     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2581   else
2582     B.buildFMinNum(Min, Fract, Const, Flags);
2583 
2584   Register CorrectedFract = Min;
2585   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2586     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2587     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2588   }
2589 
2590   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2591   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2592 
2593   MI.eraseFromParent();
2594   return true;
2595 }
2596 
2597 // Turn an illegal packed v2s16 build vector into bit operations.
2598 // TODO: This should probably be a bitcast action in LegalizerHelper.
2599 bool AMDGPULegalizerInfo::legalizeBuildVector(
2600   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2601   Register Dst = MI.getOperand(0).getReg();
2602   const LLT S32 = LLT::scalar(32);
2603   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2604 
2605   Register Src0 = MI.getOperand(1).getReg();
2606   Register Src1 = MI.getOperand(2).getReg();
2607   assert(MRI.getType(Src0) == LLT::scalar(16));
2608 
2609   auto Merge = B.buildMerge(S32, {Src0, Src1});
2610   B.buildBitcast(Dst, Merge);
2611 
2612   MI.eraseFromParent();
2613   return true;
2614 }
2615 
2616 // Return the use branch instruction, otherwise null if the usage is invalid.
2617 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2618                                        MachineRegisterInfo &MRI,
2619                                        MachineInstr *&Br,
2620                                        MachineBasicBlock *&UncondBrTarget) {
2621   Register CondDef = MI.getOperand(0).getReg();
2622   if (!MRI.hasOneNonDBGUse(CondDef))
2623     return nullptr;
2624 
2625   MachineBasicBlock *Parent = MI.getParent();
2626   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2627   if (UseMI.getParent() != Parent ||
2628       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2629     return nullptr;
2630 
2631   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2632   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2633   if (Next == Parent->end()) {
2634     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2635     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2636       return nullptr;
2637     UncondBrTarget = &*NextMBB;
2638   } else {
2639     if (Next->getOpcode() != AMDGPU::G_BR)
2640       return nullptr;
2641     Br = &*Next;
2642     UncondBrTarget = Br->getOperand(0).getMBB();
2643   }
2644 
2645   return &UseMI;
2646 }
2647 
2648 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2649                                          const ArgDescriptor *Arg,
2650                                          const TargetRegisterClass *ArgRC,
2651                                          LLT ArgTy) const {
2652   MCRegister SrcReg = Arg->getRegister();
2653   assert(SrcReg.isPhysical() && "Physical register expected");
2654   assert(DstReg.isVirtual() && "Virtual register expected");
2655 
2656   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2657                                              ArgTy);
2658   if (Arg->isMasked()) {
2659     // TODO: Should we try to emit this once in the entry block?
2660     const LLT S32 = LLT::scalar(32);
2661     const unsigned Mask = Arg->getMask();
2662     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
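    // The value is a bitfield packed into a 32-bit input register (e.g. the
    // packed workitem IDs), so extract it as
    //   (LiveIn >> Shift) & (Mask >> Shift)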
2663 
2664     Register AndMaskSrc = LiveIn;
2665 
2666     if (Shift != 0) {
2667       auto ShiftAmt = B.buildConstant(S32, Shift);
2668       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2669     }
2670 
2671     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2672   } else {
2673     B.buildCopy(DstReg, LiveIn);
2674   }
2675 
2676   return true;
2677 }
2678 
2679 bool AMDGPULegalizerInfo::loadInputValue(
2680     Register DstReg, MachineIRBuilder &B,
2681     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2682   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2683   const ArgDescriptor *Arg;
2684   const TargetRegisterClass *ArgRC;
2685   LLT ArgTy;
2686   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2687 
2688   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2689     return false; // TODO: Handle these
2690   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2691 }
2692 
2693 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2694     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2695     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2696   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2697     return false;
2698 
2699   MI.eraseFromParent();
2700   return true;
2701 }
2702 
2703 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2704                                        MachineRegisterInfo &MRI,
2705                                        MachineIRBuilder &B) const {
2706   Register Dst = MI.getOperand(0).getReg();
2707   LLT DstTy = MRI.getType(Dst);
2708   LLT S16 = LLT::scalar(16);
2709   LLT S32 = LLT::scalar(32);
2710   LLT S64 = LLT::scalar(64);
2711 
2712   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2713     return true;
2714 
2715   if (DstTy == S16)
2716     return legalizeFDIV16(MI, MRI, B);
2717   if (DstTy == S32)
2718     return legalizeFDIV32(MI, MRI, B);
2719   if (DstTy == S64)
2720     return legalizeFDIV64(MI, MRI, B);
2721 
2722   return false;
2723 }
2724 
2725 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2726                                                   Register DstReg,
2727                                                   Register X,
2728                                                   Register Y,
2729                                                   bool IsDiv) const {
2730   const LLT S1 = LLT::scalar(1);
2731   const LLT S32 = LLT::scalar(32);
2732 
2733   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2734   // algorithm used here.
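  //
  // In short: Z approximates 2^32 / Y via a float reciprocal plus one
  // Newton-Raphson refinement, Q = umulh(X, Z) then approximates X / Y, and
  // two conditional correction steps fix up the remaining error in the
  // quotient and remainder.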
2735 
2736   // Initial estimate of inv(y).
2737   auto FloatY = B.buildUITOFP(S32, Y);
2738   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2739   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2740   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2741   auto Z = B.buildFPTOUI(S32, ScaledY);
2742 
  // One round of UNR (Newton-Raphson refinement of the reciprocal estimate).
2744   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2745   auto NegYZ = B.buildMul(S32, NegY, Z);
2746   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2747 
2748   // Quotient/remainder estimate.
2749   auto Q = B.buildUMulH(S32, X, Z);
2750   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2751 
2752   // First quotient/remainder refinement.
2753   auto One = B.buildConstant(S32, 1);
2754   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2755   if (IsDiv)
2756     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2757   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2758 
2759   // Second quotient/remainder refinement.
2760   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2761   if (IsDiv)
2762     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2763   else
2764     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2765 }
2766 
2767 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2768                                               MachineRegisterInfo &MRI,
2769                                               MachineIRBuilder &B) const {
2770   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2771   Register DstReg = MI.getOperand(0).getReg();
2772   Register Num = MI.getOperand(1).getReg();
2773   Register Den = MI.getOperand(2).getReg();
2774   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2775   MI.eraseFromParent();
2776   return true;
2777 }
2778 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2780 //
2781 // Return lo, hi of result
2782 //
2783 // %cvt.lo = G_UITOFP Val.lo
2784 // %cvt.hi = G_UITOFP Val.hi
2785 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2786 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2787 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2788 // %mul2 = G_FMUL %mul1, 2**(-32)
2789 // %trunc = G_INTRINSIC_TRUNC %mul2
2790 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2791 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2792 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2793                                                        Register Val) {
2794   const LLT S32 = LLT::scalar(32);
2795   auto Unmerge = B.buildUnmerge(S32, Val);
2796 
2797   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2798   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2799 
2800   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2801                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2802 
2803   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2804   auto Mul1 =
2805       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2806 
2807   // 2**(-32)
2808   auto Mul2 =
2809       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2810   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2811 
2812   // -(2**32)
2813   auto Mad2 = B.buildFMAD(S32, Trunc,
2814                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2815 
2816   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2817   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2818 
2819   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2820 }
2821 
2822 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2823                                                   Register DstReg,
2824                                                   Register Numer,
2825                                                   Register Denom,
2826                                                   bool IsDiv) const {
2827   const LLT S32 = LLT::scalar(32);
2828   const LLT S64 = LLT::scalar(64);
2829   const LLT S1 = LLT::scalar(1);
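  // 64-bit analogue of the 32-bit expansion above: start from a float-based
  // reciprocal estimate of the denominator, refine it with two rounds of
  // mulhi-based corrections, form the quotient with a 64-bit umulh, and then
  // apply up to two conditional correction steps (the C3/C6 selects below).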
2830   Register RcpLo, RcpHi;
2831 
2832   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2833 
2834   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2835 
2836   auto Zero64 = B.buildConstant(S64, 0);
2837   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2838 
2839   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2840   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2841 
2842   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2843   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2844   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2845 
2846   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2847   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2848   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2849   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2850 
2851   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2852   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2853   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2854   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2855   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2856 
2857   auto Zero32 = B.buildConstant(S32, 0);
2858   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2859   auto Add2_HiC =
2860       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2861   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2862   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2863 
2864   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2865   Register NumerLo = UnmergeNumer.getReg(0);
2866   Register NumerHi = UnmergeNumer.getReg(1);
2867 
2868   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2869   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2870   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2871   Register Mul3_Lo = UnmergeMul3.getReg(0);
2872   Register Mul3_Hi = UnmergeMul3.getReg(1);
2873   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2874   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2875   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2876   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2877 
2878   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2879   Register DenomLo = UnmergeDenom.getReg(0);
2880   Register DenomHi = UnmergeDenom.getReg(1);
2881 
2882   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2883   auto C1 = B.buildSExt(S32, CmpHi);
2884 
2885   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2886   auto C2 = B.buildSExt(S32, CmpLo);
2887 
2888   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2889   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2890 
2891   // TODO: Here and below, portions of the code could be enclosed in if/endif.
2892   // Currently the control flow is unconditional and we have 4 selects after
2893   // the potential endif to substitute for PHIs.
2894 
2895   // if C3 != 0 ...
2896   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2897   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2898   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2899   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2900 
2901   auto One64 = B.buildConstant(S64, 1);
2902   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2903 
2904   auto C4 =
2905       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2906   auto C5 =
2907       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2908   auto C6 = B.buildSelect(
2909       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2910 
2911   // if (C6 != 0)
2912   auto Add4 = B.buildAdd(S64, Add3, One64);
2913   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2914 
2915   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2916   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2917   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2918 
2919   // endif C6
2920   // endif C3
2921 
2922   if (IsDiv) {
2923     auto Sel1 = B.buildSelect(
2924         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2925     B.buildSelect(DstReg,
2926                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2927   } else {
2928     auto Sel2 = B.buildSelect(
2929         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2930     B.buildSelect(DstReg,
2931                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2932   }
2933 }
2934 
2935 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2936                                             MachineRegisterInfo &MRI,
2937                                             MachineIRBuilder &B) const {
2938   const LLT S64 = LLT::scalar(64);
2939   const LLT S32 = LLT::scalar(32);
2940   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2941   Register DstReg = MI.getOperand(0).getReg();
2942   Register Num = MI.getOperand(1).getReg();
2943   Register Den = MI.getOperand(2).getReg();
2944   LLT Ty = MRI.getType(DstReg);
2945 
2946   if (Ty == S32)
2947     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2948   else if (Ty == S64)
2949     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2950   else
2951     return false;
2952 
2953   MI.eraseFromParent();
2954   return true;
2956 }
2957 
2958 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2959                                             MachineRegisterInfo &MRI,
2960                                             MachineIRBuilder &B) const {
2961   const LLT S64 = LLT::scalar(64);
2962   const LLT S32 = LLT::scalar(32);
2963 
2964   Register DstReg = MI.getOperand(0).getReg();
2965   const LLT Ty = MRI.getType(DstReg);
2966   if (Ty != S32 && Ty != S64)
2967     return false;
2968 
2969   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2970 
2971   Register LHS = MI.getOperand(1).getReg();
2972   Register RHS = MI.getOperand(2).getReg();
2973 
2974   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2975   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2976   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2977 
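  // Compute absolute values: Sign is all ones for a negative input and zero
  // otherwise, so (x + Sign) ^ Sign == |x|.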
2978   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2979   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2980 
2981   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2982   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2983 
2984   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2985   if (Ty == S32)
2986     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2987   else
2988     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2989 
2990   Register Sign;
2991   if (IsDiv)
2992     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2993   else
2994     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2995 
2996   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2997   B.buildSub(DstReg, UDivRem, Sign);
2998 
2999   MI.eraseFromParent();
3000   return true;
3001 }
3002 
3003 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3004                                                  MachineRegisterInfo &MRI,
3005                                                  MachineIRBuilder &B) const {
3006   Register Res = MI.getOperand(0).getReg();
3007   Register LHS = MI.getOperand(1).getReg();
3008   Register RHS = MI.getOperand(2).getReg();
3009 
3010   uint16_t Flags = MI.getFlags();
3011 
3012   LLT ResTy = MRI.getType(Res);
3013   LLT S32 = LLT::scalar(32);
3014   LLT S64 = LLT::scalar(64);
3015 
3016   const MachineFunction &MF = B.getMF();
3017   bool Unsafe =
3018     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3019 
3020   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3021     return false;
3022 
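  // The rcp-based expansion below is not denormal-safe, so when FP32 denormals
  // are enabled it is only used under unsafe-math or the arcp flag.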
3023   if (!Unsafe && ResTy == S32 &&
3024       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3025     return false;
3026 
3027   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3028     // 1 / x -> RCP(x)
3029     if (CLHS->isExactlyValue(1.0)) {
3030       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3031         .addUse(RHS)
3032         .setMIFlags(Flags);
3033 
3034       MI.eraseFromParent();
3035       return true;
3036     }
3037 
3038     // -1 / x -> RCP( FNEG(x) )
3039     if (CLHS->isExactlyValue(-1.0)) {
3040       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3041       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3042         .addUse(FNeg.getReg(0))
3043         .setMIFlags(Flags);
3044 
3045       MI.eraseFromParent();
3046       return true;
3047     }
3048   }
3049 
3050   // x / y -> x * (1.0 / y)
3051   if (Unsafe) {
3052     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3053       .addUse(RHS)
3054       .setMIFlags(Flags);
3055     B.buildFMul(Res, LHS, RCP, Flags);
3056 
3057     MI.eraseFromParent();
3058     return true;
3059   }
3060 
3061   return false;
3062 }
3063 
3064 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3065                                          MachineRegisterInfo &MRI,
3066                                          MachineIRBuilder &B) const {
3067   Register Res = MI.getOperand(0).getReg();
3068   Register LHS = MI.getOperand(1).getReg();
3069   Register RHS = MI.getOperand(2).getReg();
3070 
3071   uint16_t Flags = MI.getFlags();
3072 
3073   LLT S16 = LLT::scalar(16);
3074   LLT S32 = LLT::scalar(32);
3075 
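  // Expand f16 division via f32: extend both operands, multiply the numerator
  // by an rcp approximation of the denominator, truncate the quotient back to
  // f16, and let div_fixup handle the special cases.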
3076   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3077   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3078 
3079   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3080     .addUse(RHSExt.getReg(0))
3081     .setMIFlags(Flags);
3082 
3083   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3084   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3085 
3086   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3087     .addUse(RDst.getReg(0))
3088     .addUse(RHS)
3089     .addUse(LHS)
3090     .setMIFlags(Flags);
3091 
3092   MI.eraseFromParent();
3093   return true;
3094 }
3095 
3096 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3097 // to enable denorm mode; when false, restore the default FP32 denorm mode.
3098 static void toggleSPDenormMode(bool Enable,
3099                                MachineIRBuilder &B,
3100                                const GCNSubtarget &ST,
3101                                AMDGPU::SIModeRegisterDefaults Mode) {
3102   // Set SP denorm mode to this value.
3103   unsigned SPDenormMode =
3104     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3105 
3106   if (ST.hasDenormModeInst()) {
3107     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3108     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3109 
3110     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3111     B.buildInstr(AMDGPU::S_DENORM_MODE)
3112       .addImm(NewDenormModeValue);
3113 
3114   } else {
3115     // Select FP32 bit field in mode register.
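    // The FP32 denormal control is a 2-bit field at offset 4 in the MODE
    // register, hence OFFSET 4 and WIDTH_M1 == 1.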
3116     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3117                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3118                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3119 
3120     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3121       .addImm(SPDenormMode)
3122       .addImm(SPDenormModeBitField);
3123   }
3124 }
3125 
3126 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3127                                          MachineRegisterInfo &MRI,
3128                                          MachineIRBuilder &B) const {
3129   Register Res = MI.getOperand(0).getReg();
3130   Register LHS = MI.getOperand(1).getReg();
3131   Register RHS = MI.getOperand(2).getReg();
3132   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3133   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3134 
3135   uint16_t Flags = MI.getFlags();
3136 
3137   LLT S32 = LLT::scalar(32);
3138   LLT S1 = LLT::scalar(1);
3139 
3140   auto One = B.buildFConstant(S32, 1.0f);
3141 
3142   auto DenominatorScaled =
3143     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3144       .addUse(LHS)
3145       .addUse(RHS)
3146       .addImm(0)
3147       .setMIFlags(Flags);
3148   auto NumeratorScaled =
3149     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3150       .addUse(LHS)
3151       .addUse(RHS)
3152       .addImm(1)
3153       .setMIFlags(Flags);
3154 
3155   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3156     .addUse(DenominatorScaled.getReg(0))
3157     .setMIFlags(Flags);
3158   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3159 
3160   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3161   // aren't modeled as reading it.
3162   if (!Mode.allFP32Denormals())
3163     toggleSPDenormMode(true, B, ST, Mode);
3164 
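  // One Newton-Raphson iteration refines the rcp estimate (Fma0/Fma1); Mul is
  // the quotient estimate and the remaining FMAs compute error corrections.
  // div_fmas applies the final correction using the div_scale predicate, and
  // div_fixup undoes the scaling and handles the special cases.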
3165   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3166   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3167   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3168   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3169   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3170   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3171 
3172   if (!Mode.allFP32Denormals())
3173     toggleSPDenormMode(false, B, ST, Mode);
3174 
3175   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3176     .addUse(Fma4.getReg(0))
3177     .addUse(Fma1.getReg(0))
3178     .addUse(Fma3.getReg(0))
3179     .addUse(NumeratorScaled.getReg(1))
3180     .setMIFlags(Flags);
3181 
3182   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3183     .addUse(Fmas.getReg(0))
3184     .addUse(RHS)
3185     .addUse(LHS)
3186     .setMIFlags(Flags);
3187 
3188   MI.eraseFromParent();
3189   return true;
3190 }
3191 
3192 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3193                                          MachineRegisterInfo &MRI,
3194                                          MachineIRBuilder &B) const {
3195   Register Res = MI.getOperand(0).getReg();
3196   Register LHS = MI.getOperand(1).getReg();
3197   Register RHS = MI.getOperand(2).getReg();
3198 
3199   uint16_t Flags = MI.getFlags();
3200 
3201   LLT S64 = LLT::scalar(64);
3202   LLT S1 = LLT::scalar(1);
3203 
3204   auto One = B.buildFConstant(S64, 1.0);
3205 
3206   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3207     .addUse(LHS)
3208     .addUse(RHS)
3209     .addImm(0)
3210     .setMIFlags(Flags);
3211 
3212   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3213 
3214   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3215     .addUse(DivScale0.getReg(0))
3216     .setMIFlags(Flags);
3217 
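  // Two Newton-Raphson iterations refine the rcp estimate; Mul is the scaled
  // quotient estimate and Fma4 its residual error. div_fmas and div_fixup then
  // finish the result as in the f32 expansion.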
3218   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3219   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3220   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3221 
3222   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3223     .addUse(LHS)
3224     .addUse(RHS)
3225     .addImm(1)
3226     .setMIFlags(Flags);
3227 
3228   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3229   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3230   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3231 
3232   Register Scale;
3233   if (!ST.hasUsableDivScaleConditionOutput()) {
3234     // Workaround a hardware bug on SI where the condition output from div_scale
3235     // is not usable.
3236 
3237     LLT S32 = LLT::scalar(32);
3238 
3239     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3240     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3241     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3242     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3243 
3244     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3245                               Scale1Unmerge.getReg(1));
3246     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3247                               Scale0Unmerge.getReg(1));
3248     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3249   } else {
3250     Scale = DivScale1.getReg(1);
3251   }
3252 
3253   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3254     .addUse(Fma4.getReg(0))
3255     .addUse(Fma3.getReg(0))
3256     .addUse(Mul.getReg(0))
3257     .addUse(Scale)
3258     .setMIFlags(Flags);
3259 
3260   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3261     .addUse(Fmas.getReg(0))
3262     .addUse(RHS)
3263     .addUse(LHS)
3264     .setMIFlags(Flags);
3265 
3266   MI.eraseFromParent();
3267   return true;
3268 }
3269 
3270 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3271                                                  MachineRegisterInfo &MRI,
3272                                                  MachineIRBuilder &B) const {
3273   Register Res = MI.getOperand(0).getReg();
3274   Register LHS = MI.getOperand(2).getReg();
3275   Register RHS = MI.getOperand(3).getReg();
3276   uint16_t Flags = MI.getFlags();
3277 
3278   LLT S32 = LLT::scalar(32);
3279   LLT S1 = LLT::scalar(1);
3280 
3281   auto Abs = B.buildFAbs(S32, RHS, Flags);
3283 
3284   auto C0 = B.buildConstant(S32, 0x6f800000);
3285   auto C1 = B.buildConstant(S32, 0x2f800000);
3286   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3287 
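  // If |RHS| > 2**96 (0x6f800000), pre-scale it by 2**(-32) (0x2f800000) so
  // the rcp stays in range, and multiply the quotient by the same factor to
  // compensate; otherwise the scale factor is 1.0.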
3288   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3289   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3290 
3291   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3292 
3293   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3294     .addUse(Mul0.getReg(0))
3295     .setMIFlags(Flags);
3296 
3297   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3298 
3299   B.buildFMul(Res, Sel, Mul1, Flags);
3300 
3301   MI.eraseFromParent();
3302   return true;
3303 }
3304 
3305 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3306 // FIXME: Why do we handle this one but not other removed instructions?
3307 //
3308 // Reciprocal square root.  The clamp prevents infinite results, clamping
3309 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3310 // +-max_float.
3311 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3312                                                     MachineRegisterInfo &MRI,
3313                                                     MachineIRBuilder &B) const {
3314   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3315     return true;
3316 
3317   Register Dst = MI.getOperand(0).getReg();
3318   Register Src = MI.getOperand(2).getReg();
3319   auto Flags = MI.getFlags();
3320 
3321   LLT Ty = MRI.getType(Dst);
3322 
3323   const fltSemantics *FltSemantics;
3324   if (Ty == LLT::scalar(32))
3325     FltSemantics = &APFloat::IEEEsingle();
3326   else if (Ty == LLT::scalar(64))
3327     FltSemantics = &APFloat::IEEEdouble();
3328   else
3329     return false;
3330 
3331   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3332     .addUse(Src)
3333     .setMIFlags(Flags);
3334 
3335   // We don't need to concern ourselves with the snan handling difference, since
3336   // the rsq result is already quieted (or not); use the variant that will select directly.
3337   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3338   const bool UseIEEE = MFI->getMode().IEEE;
3339 
3340   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3341   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3342                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3343 
3344   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3345 
3346   if (UseIEEE)
3347     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3348   else
3349     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3350   MI.eraseFromParent();
3351   return true;
3352 }
3353 
3354 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3355   switch (IID) {
3356   case Intrinsic::amdgcn_ds_fadd:
3357     return AMDGPU::G_ATOMICRMW_FADD;
3358   case Intrinsic::amdgcn_ds_fmin:
3359     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3360   case Intrinsic::amdgcn_ds_fmax:
3361     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3362   default:
3363     llvm_unreachable("not a DS FP intrinsic");
3364   }
3365 }
3366 
3367 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3368                                                       MachineInstr &MI,
3369                                                       Intrinsic::ID IID) const {
3370   GISelChangeObserver &Observer = Helper.Observer;
3371   Observer.changingInstr(MI);
3372 
3373   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3374 
3375   // The remaining operands were used to set fields in the MemOperand on
3376   // construction.
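  // (For the ds.fadd/fmin/fmax intrinsics these are the ordering, scope and
  // volatile arguments, at operand indices 4-6.)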
3377   for (int I = 6; I > 3; --I)
3378     MI.RemoveOperand(I);
3379 
3380   MI.RemoveOperand(1); // Remove the intrinsic ID.
3381   Observer.changedInstr(MI);
3382   return true;
3383 }
3384 
3385 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3386                                             MachineRegisterInfo &MRI,
3387                                             MachineIRBuilder &B) const {
3388   uint64_t Offset =
3389     ST.getTargetLowering()->getImplicitParameterOffset(
3390       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3391   LLT DstTy = MRI.getType(DstReg);
3392   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3393 
3394   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3395   if (!loadInputValue(KernargPtrReg, B,
3396                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3397     return false;
3398 
3399   // FIXME: This should be nuw
3400   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3401   return true;
3402 }
3403 
3404 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3405                                                  MachineRegisterInfo &MRI,
3406                                                  MachineIRBuilder &B) const {
3407   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3408   if (!MFI->isEntryFunction()) {
3409     return legalizePreloadedArgIntrin(MI, MRI, B,
3410                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3411   }
3412 
3413   Register DstReg = MI.getOperand(0).getReg();
3414   if (!getImplicitArgPtr(DstReg, MRI, B))
3415     return false;
3416 
3417   MI.eraseFromParent();
3418   return true;
3419 }
3420 
3421 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3422                                               MachineRegisterInfo &MRI,
3423                                               MachineIRBuilder &B,
3424                                               unsigned AddrSpace) const {
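  // A flat pointer lies in the queried segment iff the high 32 bits of the
  // pointer equal that segment's aperture base.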
3425   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3426   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3427   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3428   MI.eraseFromParent();
3429   return true;
3430 }
3431 
3432 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3433 // offset (the offset that is included in bounds checking and swizzling, to be
3434 // split between the instruction's voffset and immoffset fields) and soffset
3435 // (the offset that is excluded from bounds checking and swizzling, to go in
3436 // the instruction's soffset field).  This function takes the first kind of
3437 // offset and figures out how to split it between voffset and immoffset.
3438 std::tuple<Register, unsigned, unsigned>
3439 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3440                                         Register OrigOffset) const {
3441   const unsigned MaxImm = 4095;
3442   Register BaseReg;
3443   unsigned TotalConstOffset;
3444   MachineInstr *OffsetDef;
3445   const LLT S32 = LLT::scalar(32);
3446 
3447   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3448     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3449 
3450   unsigned ImmOffset = TotalConstOffset;
3451 
3452   // If the immediate value is too big for the immoffset field, keep only the
3453   // low 12 bits of it in the immoffset field so that the value copied/added
3454   // for the voffset field is a multiple of 4096, and it stands more chance
3455   // of being CSEd with the copy/add for another similar load/store.
3456   // However, do not do that rounding down to a multiple of 4096 if that is a
3457   // negative number, as it appears to be illegal to have a negative offset
3458   // in the vgpr, even if adding the immediate offset makes it positive.
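  //
  // For example, an offset of 0x1234 is split into a voffset part of 0x1000
  // and an immoffset of 0x234, while an offset of 0x11 stays entirely in the
  // immoffset field.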
3459   unsigned Overflow = ImmOffset & ~MaxImm;
3460   ImmOffset -= Overflow;
3461   if ((int32_t)Overflow < 0) {
3462     Overflow += ImmOffset;
3463     ImmOffset = 0;
3464   }
3465 
3466   if (Overflow != 0) {
3467     if (!BaseReg) {
3468       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3469     } else {
3470       auto OverflowVal = B.buildConstant(S32, Overflow);
3471       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3472     }
3473   }
3474 
3475   if (!BaseReg)
3476     BaseReg = B.buildConstant(S32, 0).getReg(0);
3477 
3478   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3479 }
3480 
3481 /// Handle register layout difference for f16 images for some subtargets.
3482 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3483                                              MachineRegisterInfo &MRI,
3484                                              Register Reg) const {
3485   if (!ST.hasUnpackedD16VMem())
3486     return Reg;
3487 
3488   const LLT S16 = LLT::scalar(16);
3489   const LLT S32 = LLT::scalar(32);
3490   LLT StoreVT = MRI.getType(Reg);
3491   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3492 
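  // With unpacked D16, each 16-bit component occupies the low half of its own
  // 32-bit register, so widen each element of <N x s16> to s32 with anyext.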
3493   auto Unmerge = B.buildUnmerge(S16, Reg);
3494 
3495   SmallVector<Register, 4> WideRegs;
3496   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3497     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3498 
3499   int NumElts = StoreVT.getNumElements();
3500 
3501   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3502 }
3503 
3504 Register AMDGPULegalizerInfo::fixStoreSourceType(
3505   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3506   MachineRegisterInfo *MRI = B.getMRI();
3507   LLT Ty = MRI->getType(VData);
3508 
3509   const LLT S16 = LLT::scalar(16);
3510 
3511   // Fixup illegal register types for i8 stores.
3512   if (Ty == LLT::scalar(8) || Ty == S16) {
3513     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3514     return AnyExt;
3515   }
3516 
3517   if (Ty.isVector()) {
3518     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3519       if (IsFormat)
3520         return handleD16VData(B, *MRI, VData);
3521     }
3522   }
3523 
3524   return VData;
3525 }
3526 
3527 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3528                                               MachineRegisterInfo &MRI,
3529                                               MachineIRBuilder &B,
3530                                               bool IsTyped,
3531                                               bool IsFormat) const {
3532   Register VData = MI.getOperand(1).getReg();
3533   LLT Ty = MRI.getType(VData);
3534   LLT EltTy = Ty.getScalarType();
3535   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3536   const LLT S32 = LLT::scalar(32);
3537 
3538   VData = fixStoreSourceType(B, VData, IsFormat);
3539   Register RSrc = MI.getOperand(2).getReg();
3540 
3541   MachineMemOperand *MMO = *MI.memoperands_begin();
3542   const int MemSize = MMO->getSize();
3543 
3544   unsigned ImmOffset;
3545   unsigned TotalOffset;
3546 
3547   // The typed intrinsics add an immediate after the registers.
3548   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3549 
3550   // The struct intrinsic variants add one additional operand over raw.
3551   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3552   Register VIndex;
3553   int OpOffset = 0;
3554   if (HasVIndex) {
3555     VIndex = MI.getOperand(3).getReg();
3556     OpOffset = 1;
3557   }
3558 
3559   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3560   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3561 
3562   unsigned Format = 0;
3563   if (IsTyped) {
3564     Format = MI.getOperand(5 + OpOffset).getImm();
3565     ++OpOffset;
3566   }
3567 
3568   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3569 
3570   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3571   if (TotalOffset != 0)
3572     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3573 
3574   unsigned Opc;
3575   if (IsTyped) {
3576     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3577                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3578   } else if (IsFormat) {
3579     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3580                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3581   } else {
3582     switch (MemSize) {
3583     case 1:
3584       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3585       break;
3586     case 2:
3587       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3588       break;
3589     default:
3590       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3591       break;
3592     }
3593   }
3594 
3595   if (!VIndex)
3596     VIndex = B.buildConstant(S32, 0).getReg(0);
3597 
3598   auto MIB = B.buildInstr(Opc)
3599     .addUse(VData)              // vdata
3600     .addUse(RSrc)               // rsrc
3601     .addUse(VIndex)             // vindex
3602     .addUse(VOffset)            // voffset
3603     .addUse(SOffset)            // soffset
3604     .addImm(ImmOffset);         // offset(imm)
3605 
3606   if (IsTyped)
3607     MIB.addImm(Format);
3608 
3609   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3610      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3611      .addMemOperand(MMO);
3612 
3613   MI.eraseFromParent();
3614   return true;
3615 }
3616 
3617 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3618                                              MachineRegisterInfo &MRI,
3619                                              MachineIRBuilder &B,
3620                                              bool IsFormat,
3621                                              bool IsTyped) const {
3622   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3623   MachineMemOperand *MMO = *MI.memoperands_begin();
3624   const int MemSize = MMO->getSize();
3625   const LLT S32 = LLT::scalar(32);
3626 
3627   Register Dst = MI.getOperand(0).getReg();
3628   Register RSrc = MI.getOperand(2).getReg();
3629 
3630   // The typed intrinsics add an immediate after the registers.
3631   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3632 
3633   // The struct intrinsic variants add one additional operand over raw.
3634   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3635   Register VIndex;
3636   int OpOffset = 0;
3637   if (HasVIndex) {
3638     VIndex = MI.getOperand(3).getReg();
3639     OpOffset = 1;
3640   }
3641 
3642   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3643   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3644 
3645   unsigned Format = 0;
3646   if (IsTyped) {
3647     Format = MI.getOperand(5 + OpOffset).getImm();
3648     ++OpOffset;
3649   }
3650 
3651   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3652   unsigned ImmOffset;
3653   unsigned TotalOffset;
3654 
3655   LLT Ty = MRI.getType(Dst);
3656   LLT EltTy = Ty.getScalarType();
3657   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3658   const bool Unpacked = ST.hasUnpackedD16VMem();
3659 
3660   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3661   if (TotalOffset != 0)
3662     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3663 
3664   unsigned Opc;
3665 
3666   if (IsTyped) {
3667     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3668                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3669   } else if (IsFormat) {
3670     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3671                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3672   } else {
3673     switch (MemSize) {
3674     case 1:
3675       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3676       break;
3677     case 2:
3678       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3679       break;
3680     default:
3681       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3682       break;
3683     }
3684   }
3685 
3686   Register LoadDstReg;
3687 
3688   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3689   LLT UnpackedTy = Ty.changeElementSize(32);
3690 
3691   if (IsExtLoad)
3692     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3693   else if (Unpacked && IsD16 && Ty.isVector())
3694     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3695   else
3696     LoadDstReg = Dst;
3697 
3698   if (!VIndex)
3699     VIndex = B.buildConstant(S32, 0).getReg(0);
3700 
3701   auto MIB = B.buildInstr(Opc)
3702     .addDef(LoadDstReg)         // vdata
3703     .addUse(RSrc)               // rsrc
3704     .addUse(VIndex)             // vindex
3705     .addUse(VOffset)            // voffset
3706     .addUse(SOffset)            // soffset
3707     .addImm(ImmOffset);         // offset(imm)
3708 
3709   if (IsTyped)
3710     MIB.addImm(Format);
3711 
3712   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3713      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3714      .addMemOperand(MMO);
3715 
3716   if (LoadDstReg != Dst) {
3717     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3718 
3719     // Truncate the result that was widened for the extending load.
3720     if (IsExtLoad)
3721       B.buildTrunc(Dst, LoadDstReg);
3722     else {
3723       // Repack to original 16-bit vector result
3724       // FIXME: G_TRUNC should work, but legalization currently fails
3725       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3726       SmallVector<Register, 4> Repack;
3727       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3728         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3729       B.buildMerge(Dst, Repack);
3730     }
3731   }
3732 
3733   MI.eraseFromParent();
3734   return true;
3735 }
3736 
3737 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3738                                                MachineIRBuilder &B,
3739                                                bool IsInc) const {
3740   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3741                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3742   B.buildInstr(Opc)
3743     .addDef(MI.getOperand(0).getReg())
3744     .addUse(MI.getOperand(2).getReg())
3745     .addUse(MI.getOperand(3).getReg())
3746     .cloneMemRefs(MI);
3747   MI.eraseFromParent();
3748   return true;
3749 }
3750 
3751 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3752   switch (IntrID) {
3753   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3754   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3755     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3756   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3757   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3758     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3759   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3760   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3761     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3762   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3763   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3764     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3765   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3766   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3767     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3768   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3769   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3770     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3771   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3772   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3773     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3774   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3775   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3776     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3777   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3778   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3779     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3780   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3781   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3782     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3783   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3784   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3785     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3786   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3787   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3788     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3789   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3790   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3791     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3792   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3793   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3794     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3795   default:
3796     llvm_unreachable("unhandled atomic opcode");
3797   }
3798 }
3799 
3800 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3801                                                MachineIRBuilder &B,
3802                                                Intrinsic::ID IID) const {
3803   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3804                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3805   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3806 
3807   Register Dst;
3808 
3809   int OpOffset = 0;
3810   if (HasReturn) {
3811     // A few FP atomics do not support return values.
3812     Dst = MI.getOperand(0).getReg();
3813   } else {
3814     OpOffset = -1;
3815   }
3816 
3817   Register VData = MI.getOperand(2 + OpOffset).getReg();
3818   Register CmpVal;
3819 
3820   if (IsCmpSwap) {
3821     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3822     ++OpOffset;
3823   }
3824 
3825   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3826   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3827 
3828   // The struct intrinsic variants add one additional operand over raw.
3829   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3830   Register VIndex;
3831   if (HasVIndex) {
3832     VIndex = MI.getOperand(4 + OpOffset).getReg();
3833     ++OpOffset;
3834   }
3835 
3836   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3837   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3838   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3839 
3840   MachineMemOperand *MMO = *MI.memoperands_begin();
3841 
3842   unsigned ImmOffset;
3843   unsigned TotalOffset;
3844   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3845   if (TotalOffset != 0)
3846     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3847 
3848   if (!VIndex)
3849     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3850 
3851   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3852 
3853   if (HasReturn)
3854     MIB.addDef(Dst);
3855 
3856   MIB.addUse(VData); // vdata
3857 
3858   if (IsCmpSwap)
3859     MIB.addReg(CmpVal);
3860 
3861   MIB.addUse(RSrc)               // rsrc
3862      .addUse(VIndex)             // vindex
3863      .addUse(VOffset)            // voffset
3864      .addUse(SOffset)            // soffset
3865      .addImm(ImmOffset)          // offset(imm)
3866      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3867      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3868      .addMemOperand(MMO);
3869 
3870   MI.eraseFromParent();
3871   return true;
3872 }
3873 
3874 /// Turn the set of s16 typed address operands of \p MI into dword sized
3875 /// <2 x s16> registers, appending the packed values to \p PackedAddrs.
3876 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3877                                         SmallVectorImpl<Register> &PackedAddrs,
3878                                         int AddrIdx, int DimIdx, int EndIdx,
3879                                         int NumGradients) {
3880   const LLT S16 = LLT::scalar(16);
3881   const LLT V2S16 = LLT::vector(2, 16);
3882 
3883   for (int I = AddrIdx; I < EndIdx; ++I) {
3884     MachineOperand &SrcOp = MI.getOperand(I);
3885     if (!SrcOp.isReg())
3886       continue; // _L to _LZ may have eliminated this.
3887 
3888     Register AddrReg = SrcOp.getReg();
3889 
3890     if (I < DimIdx) {
3891       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3892       PackedAddrs.push_back(AddrReg);
3893     } else {
3894       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3895       // derivatives dx/dh and dx/dv are packed with undef.
3896       if (((I + 1) >= EndIdx) ||
3897           ((NumGradients / 2) % 2 == 1 &&
3898            (I == DimIdx + (NumGradients / 2) - 1 ||
3899             I == DimIdx + NumGradients - 1)) ||
3900           // Check for _L to _LZ optimization
3901           !MI.getOperand(I + 1).isReg()) {
3902         PackedAddrs.push_back(
3903             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3904                 .getReg(0));
3905       } else {
3906         PackedAddrs.push_back(
3907             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3908                 .getReg(0));
3909         ++I;
3910       }
3911     }
3912   }
3913 }
3914 
3915 /// Convert from separate vaddr components to a single vector address register,
3916 /// and replace the remaining operands with $noreg.
3917 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3918                                      int DimIdx, int NumVAddrs) {
3919   const LLT S32 = LLT::scalar(32);
3920 
3921   SmallVector<Register, 8> AddrRegs;
3922   for (int I = 0; I != NumVAddrs; ++I) {
3923     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3924     if (SrcOp.isReg()) {
3925       AddrRegs.push_back(SrcOp.getReg());
3926       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3927     }
3928   }
3929 
3930   int NumAddrRegs = AddrRegs.size();
3931   if (NumAddrRegs != 1) {
3932     // Round up to 8 elements for v5-v7
3933     // FIXME: Missing intermediate sized register classes and instructions.
3934     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3935       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3936       auto Undef = B.buildUndef(S32);
3937       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3938       NumAddrRegs = RoundedNumRegs;
3939     }
3940 
3941     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3942     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3943   }
3944 
3945   for (int I = 1; I != NumVAddrs; ++I) {
3946     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3947     if (SrcOp.isReg())
3948       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3949   }
3950 }
3951 
3952 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3953 ///
3954 /// Depending on the subtarget, load/store with 16-bit element data need to be
3955 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3956 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3957 /// registers.
3958 ///
3959 /// We don't want to directly select image instructions just yet, but we also
3960 /// want to expose all register repacking to the legalizer/combiners. We also
3961 /// don't want a selected instruction entering RegBankSelect. In order to avoid
3962 /// defining a multitude of intermediate image instructions, directly hack on
3963 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3964 /// padding now-unnecessary arguments with $noreg.
3965 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3966     MachineInstr &MI, MachineIRBuilder &B,
3967     GISelChangeObserver &Observer,
3968     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3969 
3970   const int NumDefs = MI.getNumExplicitDefs();
3971   bool IsTFE = NumDefs == 2;
3972   // We are only processing the operands of d16 image operations on subtargets
3973   // that use the unpacked register layout, or need to repack the TFE result.
3974 
3975   // TODO: Do we need to guard against already legalized intrinsics?
3976   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3977     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3978 
3979   MachineRegisterInfo *MRI = B.getMRI();
3980   const LLT S32 = LLT::scalar(32);
3981   const LLT S16 = LLT::scalar(16);
3982   const LLT V2S16 = LLT::vector(2, 16);
3983 
3984   // Index of first address argument
3985   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3986 
3987   int NumVAddrs, NumGradients;
3988   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3989   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3990     getDMaskIdx(BaseOpcode, NumDefs);
3991   unsigned DMask = 0;
3992 
3993   // Check for 16 bit addresses and pack if true.
3994   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3995   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3996   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3997   const bool IsG16 = GradTy == S16;
3998   const bool IsA16 = AddrTy == S16;
3999 
4000   int DMaskLanes = 0;
4001   if (!BaseOpcode->Atomic) {
4002     DMask = MI.getOperand(DMaskIdx).getImm();
4003     if (BaseOpcode->Gather4) {
4004       DMaskLanes = 4;
4005     } else if (DMask != 0) {
4006       DMaskLanes = countPopulation(DMask);
4007     } else if (!IsTFE && !BaseOpcode->Store) {
4008       // If dmask is 0, this is a no-op load. This can be eliminated.
4009       B.buildUndef(MI.getOperand(0));
4010       MI.eraseFromParent();
4011       return true;
4012     }
4013   }
4014 
4015   Observer.changingInstr(MI);
4016   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4017 
4018   unsigned NewOpcode = NumDefs == 0 ?
4019     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4020 
4021   // Track that we legalized this
4022   MI.setDesc(B.getTII().get(NewOpcode));
4023 
4024   // We expect to get an error flag since TFE is on and dmask is 0. Force
4025   // dmask to be at least 1, otherwise the instruction will fail.
4026   if (IsTFE && DMask == 0) {
4027     DMask = 0x1;
4028     DMaskLanes = 1;
4029     MI.getOperand(DMaskIdx).setImm(DMask);
4030   }
4031 
4032   if (BaseOpcode->Atomic) {
4033     Register VData0 = MI.getOperand(2).getReg();
4034     LLT Ty = MRI->getType(VData0);
4035 
4036     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4037     if (Ty.isVector())
4038       return false;
4039 
4040     if (BaseOpcode->AtomicX2) {
4041       Register VData1 = MI.getOperand(3).getReg();
4042       // The two values are packed in one register.
4043       LLT PackedTy = LLT::vector(2, Ty);
4044       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4045       MI.getOperand(2).setReg(Concat.getReg(0));
4046       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4047     }
4048   }
4049 
4050   int CorrectedNumVAddrs = NumVAddrs;
4051 
4052   // Optimize _L to _LZ when _L is zero
4053   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4054         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
4055     const ConstantFP *ConstantLod;
4056     const int LodIdx = AddrIdx + NumVAddrs - 1;
4057 
4058     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
4059       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4060         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4061         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
4062           LZMappingInfo->LZ, ImageDimIntr->Dim);
4063 
4064         // The starting indexes should remain in the same place.
4065         --NumVAddrs;
4066         --CorrectedNumVAddrs;
4067 
4068         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
4069           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
4070         MI.RemoveOperand(LodIdx);
4071       }
4072     }
4073   }
4074 
4075   // Optimize _mip away, when 'lod' is zero
4076   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
4077     int64_t ConstantLod;
4078     const int LodIdx = AddrIdx + NumVAddrs - 1;
4079 
4080     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
4081       if (ConstantLod == 0) {
4082         // TODO: Change the intrinsic opcode and remove the operand instead of
4083         // replacing it with 0, as is done for the _L to _LZ handling above.
4084         MI.getOperand(LodIdx).ChangeToImmediate(0);
4085         --CorrectedNumVAddrs;
4086       }
4087     }
4088   }
4089 
4090   // Rewrite the addressing register layout before doing anything else.
4091   if (IsA16 || IsG16) {
4092     if (IsA16) {
4093       // Target must support the feature and gradients need to be 16 bit too
4094       if (!ST.hasA16() || !IsG16)
4095         return false;
4096     } else if (!ST.hasG16())
4097       return false;
4098 
4099     if (NumVAddrs > 1) {
4100       SmallVector<Register, 4> PackedRegs;
4101       // Don't compress addresses for G16
4102       const int PackEndIdx =
4103           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4104       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4105                                   PackEndIdx, NumGradients);
4106 
4107       if (!IsA16) {
4108         // Add uncompressed address
4109         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4110           Register AddrReg = MI.getOperand(I).getReg();
4111           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4112           PackedRegs.push_back(AddrReg);
4113         }
4114       }
4115 
4116       // See also below in the non-a16 branch
4117       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4118 
4119       if (!UseNSA && PackedRegs.size() > 1) {
4120         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4121         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4122         PackedRegs[0] = Concat.getReg(0);
4123         PackedRegs.resize(1);
4124       }
4125 
4126       const int NumPacked = PackedRegs.size();
4127       for (int I = 0; I != NumVAddrs; ++I) {
4128         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4129         if (!SrcOp.isReg()) {
4130           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4131           continue;
4132         }
4133 
4134         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4135 
4136         if (I < NumPacked)
4137           SrcOp.setReg(PackedRegs[I]);
4138         else
4139           SrcOp.setReg(AMDGPU::NoRegister);
4140       }
4141     }
4142   } else {
4143     // If the register allocator cannot place the address registers contiguously
4144     // without introducing moves, then using the non-sequential address encoding
4145     // is always preferable, since it saves VALU instructions and is usually a
4146     // wash in terms of code size or even better.
4147     //
4148     // However, we currently have no way of hinting to the register allocator
4149     // that MIMG addresses should be placed contiguously when it is possible to
4150     // do so, so force non-NSA for the common 2-address case as a heuristic.
4151     //
4152     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4153     // allocation when possible.
4154     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4155 
4156     if (!UseNSA && NumVAddrs > 1)
4157       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4158   }
4159 
4160   int Flags = 0;
4161   if (IsA16)
4162     Flags |= 1;
4163   if (IsG16)
4164     Flags |= 2;
4165   MI.addOperand(MachineOperand::CreateImm(Flags));
4166 
4167   if (BaseOpcode->Store) { // No TFE for stores?
4168     // TODO: Handle dmask trim
4169     Register VData = MI.getOperand(1).getReg();
4170     LLT Ty = MRI->getType(VData);
4171     if (!Ty.isVector() || Ty.getElementType() != S16)
4172       return true;
4173 
4174     Register RepackedReg = handleD16VData(B, *MRI, VData);
4175     if (RepackedReg != VData) {
4176       MI.getOperand(1).setReg(RepackedReg);
4177     }
4178 
4179     return true;
4180   }
4181 
4182   Register DstReg = MI.getOperand(0).getReg();
4183   LLT Ty = MRI->getType(DstReg);
4184   const LLT EltTy = Ty.getScalarType();
4185   const bool IsD16 = Ty.getScalarType() == S16;
4186   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4187 
4188   // Confirm that the return type is large enough for the dmask specified
4189   if (NumElts < DMaskLanes)
4190     return false;
4191 
4192   if (NumElts > 4 || DMaskLanes > 4)
4193     return false;
4194 
4195   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4196   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4197 
4198   // The raw dword aligned data component of the load. The only legal cases
4199   // where this matters should be when using the packed D16 format, for
4200   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4201   LLT RoundedTy;
4202 
4203   // S32 vector to cover all data, plus the TFE result element.
4204   LLT TFETy;
4205 
4206   // Register type to use for each loaded component. Will be S32 or V2S16.
4207   LLT RegTy;
4208 
4209   if (IsD16 && ST.hasUnpackedD16VMem()) {
4210     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4211     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4212     RegTy = S32;
4213   } else {
4214     unsigned EltSize = EltTy.getSizeInBits();
4215     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4216     unsigned RoundedSize = 32 * RoundedElts;
4217     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4218     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4219     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4220   }
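  // For example, a packed D16 load with DMaskLanes == 3 and TFE gives
  // AdjustedTy = <3 x s16>, RoundedTy = <4 x s16>, TFETy = <3 x s32> and
  // RegTy = s32 (RegTy would be <2 x s16> without TFE).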
4221 
4222   // The return type does not need adjustment.
4223   // TODO: Should we change s16 case to s32 or <2 x s16>?
4224   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4225     return true;
4226 
4227   Register Dst1Reg;
4228 
4229   // Insert after the instruction.
4230   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4231 
4232   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4233   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4234   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4235   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4236 
4237   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4238 
4239   MI.getOperand(0).setReg(NewResultReg);
4240 
4241   // In the IR, TFE is supposed to be used with a 2 element struct return
4242   // type. The instruction really returns these two values in one contiguous
4243   // register, with one additional dword beyond the loaded data. Rewrite the
4244   // return type to use a single register result.
4245 
4246   if (IsTFE) {
4247     Dst1Reg = MI.getOperand(1).getReg();
4248     if (MRI->getType(Dst1Reg) != S32)
4249       return false;
4250 
4251     // TODO: Make sure the TFE operand bit is set.
4252     MI.RemoveOperand(1);
4253 
4254     // Handle the easy case that requires no repack instructions.
4255     if (Ty == S32) {
4256       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4257       return true;
4258     }
4259   }
4260 
4261   // Now figure out how to copy the new result register back into the old
4262   // result.
4263   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4264 
4265   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4266 
4267   if (ResultNumRegs == 1) {
4268     assert(!IsTFE);
4269     ResultRegs[0] = NewResultReg;
4270   } else {
4271     // We have to repack into a new vector of some kind.
4272     for (int I = 0; I != NumDataRegs; ++I)
4273       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4274     B.buildUnmerge(ResultRegs, NewResultReg);
4275 
4276     // Drop the final TFE element to get the data part. The TFE result is
4277     // directly written to the right place already.
4278     if (IsTFE)
4279       ResultRegs.resize(NumDataRegs);
4280   }
4281 
4282   // For an s16 scalar result, we form an s32 result with a truncate regardless
4283   // of packed vs. unpacked.
4284   if (IsD16 && !Ty.isVector()) {
4285     B.buildTrunc(DstReg, ResultRegs[0]);
4286     return true;
4287   }
4288 
4289   // Avoid a build/concat_vector of 1 entry.
4290   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4291     B.buildBitcast(DstReg, ResultRegs[0]);
4292     return true;
4293   }
4294 
4295   assert(Ty.isVector());
4296 
4297   if (IsD16) {
4298     // For packed D16 results with TFE enabled, all the data components are
4299     // S32. Cast back to the expected type.
4300     //
4301     // TODO: We don't really need to load s32 elements. We would only need one
4302     // cast for the TFE result if a multiple of v2s16 was used.
4303     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4304       for (Register &Reg : ResultRegs)
4305         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4306     } else if (ST.hasUnpackedD16VMem()) {
4307       for (Register &Reg : ResultRegs)
4308         Reg = B.buildTrunc(S16, Reg).getReg(0);
4309     }
4310   }
4311 
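  // Helper to append NumElts copies of an undef value of type Ty, padding
  // ResultRegs out to cover the full destination type.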
4312   auto padWithUndef = [&](LLT Ty, int NumElts) {
4313     if (NumElts == 0)
4314       return;
4315     Register Undef = B.buildUndef(Ty).getReg(0);
4316     for (int I = 0; I != NumElts; ++I)
4317       ResultRegs.push_back(Undef);
4318   };
4319 
4320   // Pad out any elements eliminated due to the dmask.
4321   LLT ResTy = MRI->getType(ResultRegs[0]);
4322   if (!ResTy.isVector()) {
4323     padWithUndef(ResTy, NumElts - ResultRegs.size());
4324     B.buildBuildVector(DstReg, ResultRegs);
4325     return true;
4326   }
4327 
4328   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4329   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4330 
4331   // Deal with the one annoying legal case.
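  // v3s16 is legal, but cannot be formed directly by concatenating v2s16
  // pieces, so pad to v6s16, concat, and unmerge to pull out the v3s16 result.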
4332   const LLT V3S16 = LLT::vector(3, 16);
4333   if (Ty == V3S16) {
4334     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4335     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4336     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4337     return true;
4338   }
4339 
4340   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4341   B.buildConcatVectors(DstReg, ResultRegs);
4342   return true;
4343 }
4344 
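// Rewrite an s_buffer_load intrinsic into the G_AMDGPU_S_BUFFER_LOAD pseudo:
// bitcast the result type if needed, attach the memory operand the intrinsic
// lacks, and widen any non-power-of-2 sized result.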
4345 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4346   LegalizerHelper &Helper, MachineInstr &MI) const {
4347   MachineIRBuilder &B = Helper.MIRBuilder;
4348   GISelChangeObserver &Observer = Helper.Observer;
4349 
4350   Register Dst = MI.getOperand(0).getReg();
4351   LLT Ty = B.getMRI()->getType(Dst);
4352   unsigned Size = Ty.getSizeInBits();
4353   MachineFunction &MF = B.getMF();
4354 
4355   Observer.changingInstr(MI);
4356 
4357   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4358     Ty = getBitcastRegisterType(Ty);
4359     Helper.bitcastDst(MI, Ty, 0);
4360     Dst = MI.getOperand(0).getReg();
4361     B.setInsertPt(B.getMBB(), MI);
4362   }
4363 
4364   // FIXME: We don't really need this intermediate instruction. The intrinsic
4365   // should be fixed to have a memory operand. Since it's readnone, we're not
4366   // allowed to add one.
4367   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4368   MI.RemoveOperand(1); // Remove intrinsic ID
4369 
4370   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4371   // TODO: Should this use datalayout alignment?
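  // The result size is in bits; round up to a whole number of bytes for the
  // memory operand size.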
4372   const unsigned MemSize = (Size + 7) / 8;
4373   const Align MemAlign(4);
4374   MachineMemOperand *MMO = MF.getMachineMemOperand(
4375       MachinePointerInfo(),
4376       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4377           MachineMemOperand::MOInvariant,
4378       MemSize, MemAlign);
4379   MI.addMemOperand(MF, MMO);
4380 
4381   // There are no 96-bit result scalar loads, but widening to 128-bit should
4382   // always be legal. We may need to restore this to a 96-bit result if it turns
4383   // out this needs to be converted to a vector load during RegBankSelect.
4384   if (!isPowerOf2_32(Size)) {
4385     if (Ty.isVector())
4386       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4387     else
4388       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4389   }
4390 
4391   Observer.changedInstr(MI);
4392   return true;
4393 }
4394 
4395 // TODO: Move to selection
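// Lower llvm.trap: without a usable HSA trap handler this is just s_endpgm;
// otherwise pass the queue pointer to the handler in SGPR0_SGPR1 and emit
// s_trap.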
4396 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4397                                                 MachineRegisterInfo &MRI,
4398                                                 MachineIRBuilder &B) const {
4399   // On a non-HSA path, or if the trap handler is disabled, insert s_endpgm.
4400   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4401       !ST.isTrapHandlerEnabled()) {
4402     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4403   } else {
4404     // Pass queue pointer to trap handler as input, and insert trap instruction
4405     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4406     MachineRegisterInfo &MRI = *B.getMRI();
4407 
4408     Register LiveIn =
4409       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4410     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4411       return false;
4412 
4413     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4414     B.buildCopy(SGPR01, LiveIn);
4415     B.buildInstr(AMDGPU::S_TRAP)
4416         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4417         .addReg(SGPR01, RegState::Implicit);
4418   }
4419 
4420   MI.eraseFromParent();
4421   return true;
4422 }
4423 
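// Lower llvm.debugtrap to s_trap, or emit a warning diagnostic when no usable
// trap handler is available.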
4424 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4425     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4426   // On a non-HSA path, or if the trap handler is disabled, report a
4427   // warning instead of trapping.
4428   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4429       !ST.isTrapHandlerEnabled()) {
4430     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4431                                      "debugtrap handler not supported",
4432                                      MI.getDebugLoc(), DS_Warning);
4433     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4434     Ctx.diagnose(NoTrap);
4435   } else {
4436     // Insert debug-trap instruction
4437     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4438   }
4439 
4440   MI.eraseFromParent();
4441   return true;
4442 }
4443 
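// Custom legalization of target intrinsics, dispatched on the intrinsic ID.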
4444 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4445                                             MachineInstr &MI) const {
4446   MachineIRBuilder &B = Helper.MIRBuilder;
4447   MachineRegisterInfo &MRI = *B.getMRI();
4448 
4449   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4450   auto IntrID = MI.getIntrinsicID();
4451   switch (IntrID) {
4452   case Intrinsic::amdgcn_if:
4453   case Intrinsic::amdgcn_else: {
4454     MachineInstr *Br = nullptr;
4455     MachineBasicBlock *UncondBrTarget = nullptr;
4456     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4457       const SIRegisterInfo *TRI
4458         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4459 
4460       Register Def = MI.getOperand(1).getReg();
4461       Register Use = MI.getOperand(3).getReg();
4462 
4463       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4464       B.setInsertPt(B.getMBB(), BrCond->getIterator());
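      // Build SI_IF / SI_ELSE at the position of the conditional branch; these
      // pseudos manipulate exec and branch to UncondBrTarget.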
4465       if (IntrID == Intrinsic::amdgcn_if) {
4466         B.buildInstr(AMDGPU::SI_IF)
4467           .addDef(Def)
4468           .addUse(Use)
4469           .addMBB(UncondBrTarget);
4470       } else {
4471         B.buildInstr(AMDGPU::SI_ELSE)
4472           .addDef(Def)
4473           .addUse(Use)
4474           .addMBB(UncondBrTarget)
4475           .addImm(0);
4476       }
4477 
4478       if (Br) {
4479         Br->getOperand(0).setMBB(CondBrTarget);
4480       } else {
4481         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4482         // since we're swapping branch targets it needs to be reinserted.
4483         // FIXME: IRTranslator should probably not do this
4484         B.buildBr(*CondBrTarget);
4485       }
4486 
4487       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4488       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4489       MI.eraseFromParent();
4490       BrCond->eraseFromParent();
4491       return true;
4492     }
4493 
4494     return false;
4495   }
4496   case Intrinsic::amdgcn_loop: {
4497     MachineInstr *Br = nullptr;
4498     MachineBasicBlock *UncondBrTarget = nullptr;
4499     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4500       const SIRegisterInfo *TRI
4501         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4502 
4503       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4504       Register Reg = MI.getOperand(2).getReg();
4505 
4506       B.setInsertPt(B.getMBB(), BrCond->getIterator());
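      // As above, replace the conditional branch with the exec-manipulating
      // SI_LOOP pseudo targeting UncondBrTarget.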
4507       B.buildInstr(AMDGPU::SI_LOOP)
4508         .addUse(Reg)
4509         .addMBB(UncondBrTarget);
4510 
4511       if (Br)
4512         Br->getOperand(0).setMBB(CondBrTarget);
4513       else
4514         B.buildBr(*CondBrTarget);
4515 
4516       MI.eraseFromParent();
4517       BrCond->eraseFromParent();
4518       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4519       return true;
4520     }
4521 
4522     return false;
4523   }
4524   case Intrinsic::amdgcn_kernarg_segment_ptr:
4525     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4526       // This only makes sense to call in a kernel, so just lower to null.
4527       B.buildConstant(MI.getOperand(0).getReg(), 0);
4528       MI.eraseFromParent();
4529       return true;
4530     }
4531 
4532     return legalizePreloadedArgIntrin(
4533       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4534   case Intrinsic::amdgcn_implicitarg_ptr:
4535     return legalizeImplicitArgPtr(MI, MRI, B);
4536   case Intrinsic::amdgcn_workitem_id_x:
4537     return legalizePreloadedArgIntrin(MI, MRI, B,
4538                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4539   case Intrinsic::amdgcn_workitem_id_y:
4540     return legalizePreloadedArgIntrin(MI, MRI, B,
4541                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4542   case Intrinsic::amdgcn_workitem_id_z:
4543     return legalizePreloadedArgIntrin(MI, MRI, B,
4544                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4545   case Intrinsic::amdgcn_workgroup_id_x:
4546     return legalizePreloadedArgIntrin(MI, MRI, B,
4547                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4548   case Intrinsic::amdgcn_workgroup_id_y:
4549     return legalizePreloadedArgIntrin(MI, MRI, B,
4550                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4551   case Intrinsic::amdgcn_workgroup_id_z:
4552     return legalizePreloadedArgIntrin(MI, MRI, B,
4553                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4554   case Intrinsic::amdgcn_dispatch_ptr:
4555     return legalizePreloadedArgIntrin(MI, MRI, B,
4556                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4557   case Intrinsic::amdgcn_queue_ptr:
4558     return legalizePreloadedArgIntrin(MI, MRI, B,
4559                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4560   case Intrinsic::amdgcn_implicit_buffer_ptr:
4561     return legalizePreloadedArgIntrin(
4562       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4563   case Intrinsic::amdgcn_dispatch_id:
4564     return legalizePreloadedArgIntrin(MI, MRI, B,
4565                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4566   case Intrinsic::amdgcn_fdiv_fast:
4567     return legalizeFDIVFastIntrin(MI, MRI, B);
4568   case Intrinsic::amdgcn_is_shared:
4569     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4570   case Intrinsic::amdgcn_is_private:
4571     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4572   case Intrinsic::amdgcn_wavefrontsize: {
4573     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4574     MI.eraseFromParent();
4575     return true;
4576   }
4577   case Intrinsic::amdgcn_s_buffer_load:
4578     return legalizeSBufferLoad(Helper, MI);
4579   case Intrinsic::amdgcn_raw_buffer_store:
4580   case Intrinsic::amdgcn_struct_buffer_store:
4581     return legalizeBufferStore(MI, MRI, B, false, false);
4582   case Intrinsic::amdgcn_raw_buffer_store_format:
4583   case Intrinsic::amdgcn_struct_buffer_store_format:
4584     return legalizeBufferStore(MI, MRI, B, false, true);
4585   case Intrinsic::amdgcn_raw_tbuffer_store:
4586   case Intrinsic::amdgcn_struct_tbuffer_store:
4587     return legalizeBufferStore(MI, MRI, B, true, true);
4588   case Intrinsic::amdgcn_raw_buffer_load:
4589   case Intrinsic::amdgcn_struct_buffer_load:
4590     return legalizeBufferLoad(MI, MRI, B, false, false);
4591   case Intrinsic::amdgcn_raw_buffer_load_format:
4592   case Intrinsic::amdgcn_struct_buffer_load_format:
4593     return legalizeBufferLoad(MI, MRI, B, true, false);
4594   case Intrinsic::amdgcn_raw_tbuffer_load:
4595   case Intrinsic::amdgcn_struct_tbuffer_load:
4596     return legalizeBufferLoad(MI, MRI, B, true, true);
4597   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4598   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4599   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4600   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4601   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4602   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4603   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4604   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4605   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4606   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4607   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4608   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4609   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4610   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4611   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4612   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4613   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4614   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4615   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4616   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4617   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4618   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4619   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4620   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4621   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4622   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4623   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4624   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4625     return legalizeBufferAtomic(MI, B, IntrID);
4626   case Intrinsic::amdgcn_atomic_inc:
4627     return legalizeAtomicIncDec(MI, B, true);
4628   case Intrinsic::amdgcn_atomic_dec:
4629     return legalizeAtomicIncDec(MI, B, false);
4630   case Intrinsic::trap:
4631     return legalizeTrapIntrinsic(MI, MRI, B);
4632   case Intrinsic::debugtrap:
4633     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4634   case Intrinsic::amdgcn_rsq_clamp:
4635     return legalizeRsqClampIntrinsic(MI, MRI, B);
4636   case Intrinsic::amdgcn_ds_fadd:
4637   case Intrinsic::amdgcn_ds_fmin:
4638   case Intrinsic::amdgcn_ds_fmax:
4639     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4640   default: {
4641     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4642             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4643       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4644     return true;
4645   }
4646   }
4647 
4648   return true;
4649 }
4650