1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round up the number of vector elements to the next power of two.
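// e.g. <3 x s16> -> <4 x s16> and <5 x s32> -> <8 x s32>.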
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round up the scalar size in bits to the next power of two.
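// e.g. s23 -> s32 and s65 -> s128.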
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
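// Matches vectors with more than two 16-bit elements, e.g. <4 x s16>.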
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
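// Break a vector into pieces of at most 64 bits each; the returned type is the
// per-piece type, e.g. <4 x s32> -> <2 x s32>.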
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
114 // Increase the number of vector elements to reach the next multiple of 32-bit
115 // type.
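// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits) and
// <5 x s16> (80 bits) -> <6 x s16> (96 bits).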
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
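  // <6 x s16> -> <3 x s32>
  // <4 x s64> -> <8 x s32>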
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
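// Bitcast to a type made of 32-bit pieces, e.g. <4 x s16> -> <2 x s32> and
// s64 -> <2 x s32>.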
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
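// Vector element sizes that map directly onto registers: 32, 64, 128 or 256
// bits, or 16-bit elements in even counts.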
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
// Any combination of 32-bit or 64-bit elements up to the maximum register
// size, and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
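// Matches truncating stores of a scalar wider than 32 bits, where the memory
// size is smaller than the register size.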
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
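// Return true if the register type, memory size, alignment and address space
// combination is one the legalizer accepts for a load/store.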
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
341 
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348 
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
353   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
356 
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
362 
/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
366 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
367                             unsigned AlignInBits, unsigned AddrSpace,
368                             unsigned Opcode) {
369   // We don't want to widen cases that are naturally legal.
370   if (isPowerOf2_32(SizeInBits))
371     return false;
372 
373   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
374   // end up widening these for a scalar load during RegBankSelect, since there
375   // aren't 96-bit scalar loads.
376   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
377     return false;
378 
379   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
380     return false;
381 
382   // A load is known dereferenceable up to the alignment, so it's legal to widen
383   // to it.
384   //
385   // TODO: Could check dereferenceable for less aligned cases.
386   unsigned RoundedSize = NextPowerOf2(SizeInBits);
387   if (AlignInBits < RoundedSize)
388     return false;
389 
390   // Do not widen if it would introduce a slow unaligned load.
391   const SITargetLowering *TLI = ST.getTargetLowering();
392   bool Fast = false;
393   return TLI->allowsMisalignedMemoryAccessesImpl(
394              RoundedSize, AddrSpace, Align(AlignInBits / 8),
395              MachineMemOperand::MOLoad, &Fast) &&
396          Fast;
397 }
398 
399 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
400                             unsigned Opcode) {
401   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
402     return false;
403 
404   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
405                          Query.MMODescrs[0].AlignInBits,
406                          Query.Types[1].getAddressSpace(), Opcode);
407 }
408 
409 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
410                                          const GCNTargetMachine &TM)
  : ST(ST_) {
412   using namespace TargetOpcode;
413 
414   auto GetAddrSpacePtr = [&TM](unsigned AS) {
415     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
416   };
417 
418   const LLT S1 = LLT::scalar(1);
419   const LLT S16 = LLT::scalar(16);
420   const LLT S32 = LLT::scalar(32);
421   const LLT S64 = LLT::scalar(64);
422   const LLT S128 = LLT::scalar(128);
423   const LLT S256 = LLT::scalar(256);
424   const LLT S512 = LLT::scalar(512);
425   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
426 
427   const LLT V2S16 = LLT::vector(2, 16);
428   const LLT V4S16 = LLT::vector(4, 16);
429 
430   const LLT V2S32 = LLT::vector(2, 32);
431   const LLT V3S32 = LLT::vector(3, 32);
432   const LLT V4S32 = LLT::vector(4, 32);
433   const LLT V5S32 = LLT::vector(5, 32);
434   const LLT V6S32 = LLT::vector(6, 32);
435   const LLT V7S32 = LLT::vector(7, 32);
436   const LLT V8S32 = LLT::vector(8, 32);
437   const LLT V9S32 = LLT::vector(9, 32);
438   const LLT V10S32 = LLT::vector(10, 32);
439   const LLT V11S32 = LLT::vector(11, 32);
440   const LLT V12S32 = LLT::vector(12, 32);
441   const LLT V13S32 = LLT::vector(13, 32);
442   const LLT V14S32 = LLT::vector(14, 32);
443   const LLT V15S32 = LLT::vector(15, 32);
444   const LLT V16S32 = LLT::vector(16, 32);
445   const LLT V32S32 = LLT::vector(32, 32);
446 
447   const LLT V2S64 = LLT::vector(2, 64);
448   const LLT V3S64 = LLT::vector(3, 64);
449   const LLT V4S64 = LLT::vector(4, 64);
450   const LLT V5S64 = LLT::vector(5, 64);
451   const LLT V6S64 = LLT::vector(6, 64);
452   const LLT V7S64 = LLT::vector(7, 64);
453   const LLT V8S64 = LLT::vector(8, 64);
454   const LLT V16S64 = LLT::vector(16, 64);
455 
456   std::initializer_list<LLT> AllS32Vectors =
457     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
458      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
459   std::initializer_list<LLT> AllS64Vectors =
460     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
461 
462   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
463   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
464   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
465   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
466   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
467   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
468   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
469 
470   const LLT CodePtr = FlatPtr;
471 
472   const std::initializer_list<LLT> AddrSpaces64 = {
473     GlobalPtr, ConstantPtr, FlatPtr
474   };
475 
476   const std::initializer_list<LLT> AddrSpaces32 = {
477     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
478   };
479 
480   const std::initializer_list<LLT> FPTypesBase = {
481     S32, S64
482   };
483 
484   const std::initializer_list<LLT> FPTypes16 = {
485     S32, S64, S16
486   };
487 
488   const std::initializer_list<LLT> FPTypesPK16 = {
489     S32, S64, S16, V2S16
490   };
491 
492   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
493 
494   setAction({G_BRCOND, S1}, Legal); // VCC branches
495   setAction({G_BRCOND, S32}, Legal); // SCC branches
496 
497   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
498   // elements for v3s16
499   getActionDefinitionsBuilder(G_PHI)
500     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
501     .legalFor(AllS32Vectors)
502     .legalFor(AllS64Vectors)
503     .legalFor(AddrSpaces64)
504     .legalFor(AddrSpaces32)
505     .legalIf(isPointer(0))
506     .clampScalar(0, S16, S256)
507     .widenScalarToNextPow2(0, 32)
508     .clampMaxNumElements(0, S32, 16)
509     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
510     .scalarize(0);
511 
512   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
513     // Full set of gfx9 features.
514     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
515       .legalFor({S32, S16, V2S16})
516       .clampScalar(0, S16, S32)
517       .clampMaxNumElements(0, S16, 2)
518       .scalarize(0)
519       .widenScalarToNextPow2(0, 32);
520 
521     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
522       .legalFor({S32, S16, V2S16}) // Clamp modifier
523       .minScalarOrElt(0, S16)
524       .clampMaxNumElements(0, S16, 2)
525       .scalarize(0)
526       .widenScalarToNextPow2(0, 32)
527       .lower();
528   } else if (ST.has16BitInsts()) {
529     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
530       .legalFor({S32, S16})
531       .clampScalar(0, S16, S32)
532       .scalarize(0)
533       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
534 
    // Technically the saturating operations require clamp bit support, but
    // the clamp bit was introduced at the same time as 16-bit operations.
537     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
538       .legalFor({S32, S16}) // Clamp modifier
539       .minScalar(0, S16)
540       .scalarize(0)
541       .widenScalarToNextPow2(0, 16)
542       .lower();
543 
544     // We're just lowering this, but it helps get a better result to try to
545     // coerce to the desired type first.
546     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
547       .minScalar(0, S16)
548       .scalarize(0)
549       .lower();
550   } else {
551     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
552       .legalFor({S32})
553       .clampScalar(0, S32, S32)
554       .scalarize(0);
555 
556     if (ST.hasIntClamp()) {
557       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
558         .legalFor({S32}) // Clamp modifier.
559         .scalarize(0)
560         .minScalarOrElt(0, S32)
561         .lower();
562     } else {
563       // Clamp bit support was added in VI, along with 16-bit operations.
564       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
565         .minScalar(0, S32)
566         .scalarize(0)
567         .lower();
568     }
569 
570     // FIXME: DAG expansion gets better results. The widening uses the smaller
571     // range values and goes for the min/max lowering directly.
572     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
573       .minScalar(0, S32)
574       .scalarize(0)
575       .lower();
576   }
577 
578   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
579     .customFor({S32, S64})
580     .clampScalar(0, S32, S64)
581     .widenScalarToNextPow2(0, 32)
582     .scalarize(0);
583 
584   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
585     .legalFor({S32})
586     .clampScalar(0, S32, S32)
587     .scalarize(0);
588 
589   // Report legal for any types we can handle anywhere. For the cases only legal
590   // on the SALU, RegBankSelect will be able to re-legalize.
591   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
592     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
593     .clampScalar(0, S32, S64)
594     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
595     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
596     .widenScalarToNextPow2(0)
597     .scalarize(0);
598 
599   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
600                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
601     .legalFor({{S32, S1}, {S32, S32}})
602     .minScalar(0, S32)
603     // TODO: .scalarize(0)
604     .lower();
605 
606   getActionDefinitionsBuilder(G_BITCAST)
607     // Don't worry about the size constraint.
608     .legalIf(all(isRegisterType(0), isRegisterType(1)))
609     .lower();
610 
611 
612   getActionDefinitionsBuilder(G_CONSTANT)
613     .legalFor({S1, S32, S64, S16, GlobalPtr,
614                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
615     .legalIf(isPointer(0))
616     .clampScalar(0, S32, S64)
617     .widenScalarToNextPow2(0);
618 
619   getActionDefinitionsBuilder(G_FCONSTANT)
620     .legalFor({S32, S64, S16})
621     .clampScalar(0, S16, S64);
622 
623   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
624       .legalIf(isRegisterType(0))
625       // s1 and s16 are special cases because they have legal operations on
626       // them, but don't really occupy registers in the normal way.
627       .legalFor({S1, S16})
628       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
629       .clampScalarOrElt(0, S32, MaxScalar)
630       .widenScalarToNextPow2(0, 32)
631       .clampMaxNumElements(0, S32, 16);
632 
633   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
634 
635   // If the amount is divergent, we have to do a wave reduction to get the
636   // maximum value, so this is expanded during RegBankSelect.
637   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
638     .legalFor({{PrivatePtr, S32}});
639 
640   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
641     .customIf(typeIsNot(0, PrivatePtr));
642 
643   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
644 
645   auto &FPOpActions = getActionDefinitionsBuilder(
646     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
647     .legalFor({S32, S64});
648   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
649     .customFor({S32, S64});
650   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
651     .customFor({S32, S64});
652 
653   if (ST.has16BitInsts()) {
654     if (ST.hasVOP3PInsts())
655       FPOpActions.legalFor({S16, V2S16});
656     else
657       FPOpActions.legalFor({S16});
658 
659     TrigActions.customFor({S16});
660     FDIVActions.customFor({S16});
661   }
662 
663   auto &MinNumMaxNum = getActionDefinitionsBuilder({
664       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
665 
666   if (ST.hasVOP3PInsts()) {
667     MinNumMaxNum.customFor(FPTypesPK16)
668       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
669       .clampMaxNumElements(0, S16, 2)
670       .clampScalar(0, S16, S64)
671       .scalarize(0);
672   } else if (ST.has16BitInsts()) {
673     MinNumMaxNum.customFor(FPTypes16)
674       .clampScalar(0, S16, S64)
675       .scalarize(0);
676   } else {
677     MinNumMaxNum.customFor(FPTypesBase)
678       .clampScalar(0, S32, S64)
679       .scalarize(0);
680   }
681 
682   if (ST.hasVOP3PInsts())
683     FPOpActions.clampMaxNumElements(0, S16, 2);
684 
685   FPOpActions
686     .scalarize(0)
687     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
688 
689   TrigActions
690     .scalarize(0)
691     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
692 
693   FDIVActions
694     .scalarize(0)
695     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
696 
697   getActionDefinitionsBuilder({G_FNEG, G_FABS})
698     .legalFor(FPTypesPK16)
699     .clampMaxNumElements(0, S16, 2)
700     .scalarize(0)
701     .clampScalar(0, S16, S64);
702 
703   if (ST.has16BitInsts()) {
704     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
705       .legalFor({S32, S64, S16})
706       .scalarize(0)
707       .clampScalar(0, S16, S64);
708   } else {
709     getActionDefinitionsBuilder(G_FSQRT)
710       .legalFor({S32, S64})
711       .scalarize(0)
712       .clampScalar(0, S32, S64);
713 
714     if (ST.hasFractBug()) {
715       getActionDefinitionsBuilder(G_FFLOOR)
716         .customFor({S64})
717         .legalFor({S32, S64})
718         .scalarize(0)
719         .clampScalar(0, S32, S64);
720     } else {
721       getActionDefinitionsBuilder(G_FFLOOR)
722         .legalFor({S32, S64})
723         .scalarize(0)
724         .clampScalar(0, S32, S64);
725     }
726   }
727 
728   getActionDefinitionsBuilder(G_FPTRUNC)
729     .legalFor({{S32, S64}, {S16, S32}})
730     .scalarize(0)
731     .lower();
732 
733   getActionDefinitionsBuilder(G_FPEXT)
734     .legalFor({{S64, S32}, {S32, S16}})
735     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
736     .scalarize(0);
737 
738   getActionDefinitionsBuilder(G_FSUB)
739       // Use actual fsub instruction
740       .legalFor({S32})
741       // Must use fadd + fneg
742       .lowerFor({S64, S16, V2S16})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746   // Whether this is legal depends on the floating point mode for the function.
747   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
748   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
749     FMad.customFor({S32, S16});
750   else if (ST.hasMadMacF32Insts())
751     FMad.customFor({S32});
752   else if (ST.hasMadF16())
753     FMad.customFor({S16});
754   FMad.scalarize(0)
755       .lower();
756 
757   auto &FRem = getActionDefinitionsBuilder(G_FREM);
758   if (ST.has16BitInsts()) {
759     FRem.customFor({S16, S32, S64});
760   } else {
761     FRem.minScalar(0, S32)
762         .customFor({S32, S64});
763   }
764   FRem.scalarize(0);
765 
766   // TODO: Do we need to clamp maximum bitwidth?
767   getActionDefinitionsBuilder(G_TRUNC)
768     .legalIf(isScalar(0))
769     .legalFor({{V2S16, V2S32}})
770     .clampMaxNumElements(0, S16, 2)
771     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
772     // situations (like an invalid implicit use), we don't want to infinite loop
773     // in the legalizer.
774     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
775     .alwaysLegal();
776 
777   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
778     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
779                {S32, S1}, {S64, S1}, {S16, S1}})
780     .scalarize(0)
781     .clampScalar(0, S32, S64)
782     .widenScalarToNextPow2(1, 32);
783 
784   // TODO: Split s1->s64 during regbankselect for VALU.
785   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
786     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
787     .lowerFor({{S32, S64}})
788     .lowerIf(typeIs(1, S1))
789     .customFor({{S64, S64}});
790   if (ST.has16BitInsts())
791     IToFP.legalFor({{S16, S16}});
792   IToFP.clampScalar(1, S32, S64)
793        .minScalar(0, S32)
794        .scalarize(0)
795        .widenScalarToNextPow2(1);
796 
797   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
798     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
799     .customFor({{S64, S64}})
800     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
801   if (ST.has16BitInsts())
802     FPToI.legalFor({{S16, S16}});
803   else
804     FPToI.minScalar(1, S32);
805 
806   FPToI.minScalar(0, S32)
807        .scalarize(0)
808        .lower();
809 
810   // Lower roundeven into G_FRINT
811   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
812     .scalarize(0)
813     .lower();
814 
815   if (ST.has16BitInsts()) {
816     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
817       .legalFor({S16, S32, S64})
818       .clampScalar(0, S16, S64)
819       .scalarize(0);
820   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
821     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
822       .legalFor({S32, S64})
823       .clampScalar(0, S32, S64)
824       .scalarize(0);
825   } else {
826     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
827       .legalFor({S32})
828       .customFor({S64})
829       .clampScalar(0, S32, S64)
830       .scalarize(0);
831   }
832 
833   getActionDefinitionsBuilder(G_PTR_ADD)
834     .legalIf(all(isPointer(0), sameSize(0, 1)))
835     .scalarize(0)
836     .scalarSameSizeAs(1, 0);
837 
838   getActionDefinitionsBuilder(G_PTRMASK)
839     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
840     .scalarSameSizeAs(1, 0)
841     .scalarize(0);
842 
843   auto &CmpBuilder =
844     getActionDefinitionsBuilder(G_ICMP)
845     // The compare output type differs based on the register bank of the output,
846     // so make both s1 and s32 legal.
847     //
848     // Scalar compares producing output in scc will be promoted to s32, as that
849     // is the allocatable register type that will be needed for the copy from
850     // scc. This will be promoted during RegBankSelect, and we assume something
851     // before that won't try to use s32 result types.
852     //
853     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
854     // bank.
855     .legalForCartesianProduct(
856       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
857     .legalForCartesianProduct(
858       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
859   if (ST.has16BitInsts()) {
860     CmpBuilder.legalFor({{S1, S16}});
861   }
862 
863   CmpBuilder
864     .widenScalarToNextPow2(1)
865     .clampScalar(1, S32, S64)
866     .scalarize(0)
867     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
868 
869   getActionDefinitionsBuilder(G_FCMP)
870     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
871     .widenScalarToNextPow2(1)
872     .clampScalar(1, S32, S64)
873     .scalarize(0);
874 
875   // FIXME: fpow has a selection pattern that should move to custom lowering.
876   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
877   if (ST.has16BitInsts())
878     Exp2Ops.legalFor({S32, S16});
879   else
880     Exp2Ops.legalFor({S32});
881   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
882   Exp2Ops.scalarize(0);
883 
884   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
885   if (ST.has16BitInsts())
886     ExpOps.customFor({{S32}, {S16}});
887   else
888     ExpOps.customFor({S32});
889   ExpOps.clampScalar(0, MinScalarFPTy, S32)
890         .scalarize(0);
891 
892   getActionDefinitionsBuilder(G_FPOWI)
893     .clampScalar(0, MinScalarFPTy, S32)
894     .lower();
895 
896   // The 64-bit versions produce 32-bit results, but only on the SALU.
897   getActionDefinitionsBuilder(G_CTPOP)
898     .legalFor({{S32, S32}, {S32, S64}})
899     .clampScalar(0, S32, S32)
900     .clampScalar(1, S32, S64)
901     .scalarize(0)
902     .widenScalarToNextPow2(0, 32)
903     .widenScalarToNextPow2(1, 32);
904 
905   // The hardware instructions return a different result on 0 than the generic
906   // instructions expect. The hardware produces -1, but these produce the
907   // bitwidth.
908   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
909     .scalarize(0)
910     .clampScalar(0, S32, S32)
911     .clampScalar(1, S32, S64)
912     .widenScalarToNextPow2(0, 32)
913     .widenScalarToNextPow2(1, 32)
914     .lower();
915 
916   // The 64-bit versions produce 32-bit results, but only on the SALU.
917   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
918     .legalFor({{S32, S32}, {S32, S64}})
919     .clampScalar(0, S32, S32)
920     .clampScalar(1, S32, S64)
921     .scalarize(0)
922     .widenScalarToNextPow2(0, 32)
923     .widenScalarToNextPow2(1, 32);
924 
925   getActionDefinitionsBuilder(G_BITREVERSE)
926     .legalFor({S32})
927     .clampScalar(0, S32, S32)
928     .scalarize(0);
929 
930   if (ST.has16BitInsts()) {
931     getActionDefinitionsBuilder(G_BSWAP)
932       .legalFor({S16, S32, V2S16})
933       .clampMaxNumElements(0, S16, 2)
934       // FIXME: Fixing non-power-of-2 before clamp is workaround for
935       // narrowScalar limitation.
936       .widenScalarToNextPow2(0)
937       .clampScalar(0, S16, S32)
938       .scalarize(0);
939 
940     if (ST.hasVOP3PInsts()) {
941       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
942         .legalFor({S32, S16, V2S16})
943         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
944         .clampMaxNumElements(0, S16, 2)
945         .minScalar(0, S16)
946         .widenScalarToNextPow2(0)
947         .scalarize(0)
948         .lower();
949     } else {
950       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
951         .legalFor({S32, S16})
952         .widenScalarToNextPow2(0)
953         .minScalar(0, S16)
954         .scalarize(0)
955         .lower();
956     }
957   } else {
958     // TODO: Should have same legality without v_perm_b32
959     getActionDefinitionsBuilder(G_BSWAP)
960       .legalFor({S32})
961       .lowerIf(scalarNarrowerThan(0, 32))
962       // FIXME: Fixing non-power-of-2 before clamp is workaround for
963       // narrowScalar limitation.
964       .widenScalarToNextPow2(0)
965       .maxScalar(0, S32)
966       .scalarize(0)
967       .lower();
968 
969     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970       .legalFor({S32})
971       .minScalar(0, S32)
972       .widenScalarToNextPow2(0)
973       .scalarize(0)
974       .lower();
975   }
976 
977   getActionDefinitionsBuilder(G_INTTOPTR)
978     // List the common cases
979     .legalForCartesianProduct(AddrSpaces64, {S64})
980     .legalForCartesianProduct(AddrSpaces32, {S32})
981     .scalarize(0)
982     // Accept any address space as long as the size matches
983     .legalIf(sameSize(0, 1))
984     .widenScalarIf(smallerThan(1, 0),
985       [](const LegalityQuery &Query) {
986         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
987       })
988     .narrowScalarIf(largerThan(1, 0),
989       [](const LegalityQuery &Query) {
990         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
991       });
992 
993   getActionDefinitionsBuilder(G_PTRTOINT)
994     // List the common cases
995     .legalForCartesianProduct(AddrSpaces64, {S64})
996     .legalForCartesianProduct(AddrSpaces32, {S32})
997     .scalarize(0)
998     // Accept any address space as long as the size matches
999     .legalIf(sameSize(0, 1))
1000     .widenScalarIf(smallerThan(0, 1),
1001       [](const LegalityQuery &Query) {
1002         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1003       })
1004     .narrowScalarIf(
1005       largerThan(0, 1),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1008       });
1009 
1010   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1011     .scalarize(0)
1012     .custom();
1013 
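  // Return true if a load/store must be split: vector extloads, accesses wider
  // than the address space allows, dword counts that are not a supported power
  // of two, or accesses that are under-aligned for the subtarget.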
1014   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1015                                     bool IsLoad) -> bool {
1016     const LLT DstTy = Query.Types[0];
1017 
1018     // Split vector extloads.
1019     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1021 
1022     if (MemSize < DstTy.getSizeInBits())
1023       MemSize = std::max(MemSize, AlignBits);
1024 
1025     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1026       return true;
1027 
1028     const LLT PtrTy = Query.Types[1];
1029     unsigned AS = PtrTy.getAddressSpace();
1030     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1031       return true;
1032 
1033     // Catch weird sized loads that don't evenly divide into the access sizes
1034     // TODO: May be able to widen depending on alignment etc.
1035     unsigned NumRegs = (MemSize + 31) / 32;
1036     if (NumRegs == 3) {
1037       if (!ST.hasDwordx3LoadStores())
1038         return true;
1039     } else {
1040       // If the alignment allows, these should have been widened.
1041       if (!isPowerOf2_32(NumRegs))
1042         return true;
1043     }
1044 
1045     if (AlignBits < MemSize) {
1046       const SITargetLowering *TLI = ST.getTargetLowering();
1047       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1048                                                       Align(AlignBits / 8));
1049     }
1050 
1051     return false;
1052   };
1053 
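  // Minimum alignment (in bits) required for the explicitly listed global and
  // constant cases below; 0 if the subtarget allows unaligned buffer access.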
1054   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1055   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1056   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1057 
1058   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1059   // LDS
1060   // TODO: Unsupported flat for SI.
1061 
1062   for (unsigned Op : {G_LOAD, G_STORE}) {
1063     const bool IsStore = Op == G_STORE;
1064 
1065     auto &Actions = getActionDefinitionsBuilder(Op);
1066     // Explicitly list some common cases.
1067     // TODO: Does this help compile time at all?
1068     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1069                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1070                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1071                                       {S64, GlobalPtr, 64, GlobalAlign32},
1072                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1073                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1074                                       {S32, GlobalPtr, 8, GlobalAlign8},
1075                                       {S32, GlobalPtr, 16, GlobalAlign16},
1076 
1077                                       {S32, LocalPtr, 32, 32},
1078                                       {S64, LocalPtr, 64, 32},
1079                                       {V2S32, LocalPtr, 64, 32},
1080                                       {S32, LocalPtr, 8, 8},
1081                                       {S32, LocalPtr, 16, 16},
1082                                       {V2S16, LocalPtr, 32, 32},
1083 
1084                                       {S32, PrivatePtr, 32, 32},
1085                                       {S32, PrivatePtr, 8, 8},
1086                                       {S32, PrivatePtr, 16, 16},
1087                                       {V2S16, PrivatePtr, 32, 32},
1088 
1089                                       {S32, ConstantPtr, 32, GlobalAlign32},
1090                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1091                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1092                                       {S64, ConstantPtr, 64, GlobalAlign32},
1093                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1094     Actions.legalIf(
1095       [=](const LegalityQuery &Query) -> bool {
1096         return isLoadStoreLegal(ST, Query, Op);
1097       });
1098 
1099     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1100     // 64-bits.
1101     //
1102     // TODO: Should generalize bitcast action into coerce, which will also cover
1103     // inserting addrspacecasts.
1104     Actions.customIf(typeIs(1, Constant32Ptr));
1105 
1106     // Turn any illegal element vectors into something easier to deal
1107     // with. These will ultimately produce 32-bit scalar shifts to extract the
1108     // parts anyway.
1109     //
1110     // For odd 16-bit element vectors, prefer to split those into pieces with
1111     // 16-bit vector parts.
1112     Actions.bitcastIf(
1113       [=](const LegalityQuery &Query) -> bool {
1114         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1115                                           Query.MMODescrs[0].SizeInBits);
1116       }, bitcastToRegisterType(0));
1117 
1118     if (!IsStore) {
1119       // Widen suitably aligned loads by loading extra bytes. The standard
1120       // legalization actions can't properly express widening memory operands.
1121       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1122         return shouldWidenLoad(ST, Query, G_LOAD);
1123       });
1124     }
1125 
1126     // FIXME: load/store narrowing should be moved to lower action
1127     Actions
1128         .narrowScalarIf(
1129             [=](const LegalityQuery &Query) -> bool {
1130               return !Query.Types[0].isVector() &&
1131                      needToSplitMemOp(Query, Op == G_LOAD);
1132             },
1133             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1134               const LLT DstTy = Query.Types[0];
1135               const LLT PtrTy = Query.Types[1];
1136 
1137               const unsigned DstSize = DstTy.getSizeInBits();
1138               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1139 
1140               // Split extloads.
1141               if (DstSize > MemSize)
1142                 return std::make_pair(0, LLT::scalar(MemSize));
1143 
1144               if (!isPowerOf2_32(DstSize)) {
1145                 // We're probably decomposing an odd sized store. Try to split
1146                 // to the widest type. TODO: Account for alignment. As-is it
1147                 // should be OK, since the new parts will be further legalized.
1148                 unsigned FloorSize = PowerOf2Floor(DstSize);
1149                 return std::make_pair(0, LLT::scalar(FloorSize));
1150               }
1151 
1152               if (DstSize > 32 && (DstSize % 32 != 0)) {
1153                 // FIXME: Need a way to specify non-extload of larger size if
1154                 // suitably aligned.
1155                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1156               }
1157 
1158               unsigned MaxSize = maxSizeForAddrSpace(ST,
1159                                                      PtrTy.getAddressSpace(),
1160                                                      Op == G_LOAD);
1161               if (MemSize > MaxSize)
1162                 return std::make_pair(0, LLT::scalar(MaxSize));
1163 
1164               unsigned Align = Query.MMODescrs[0].AlignInBits;
1165               return std::make_pair(0, LLT::scalar(Align));
1166             })
1167         .fewerElementsIf(
1168             [=](const LegalityQuery &Query) -> bool {
1169               return Query.Types[0].isVector() &&
1170                      needToSplitMemOp(Query, Op == G_LOAD);
1171             },
1172             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1173               const LLT DstTy = Query.Types[0];
1174               const LLT PtrTy = Query.Types[1];
1175 
1176               LLT EltTy = DstTy.getElementType();
1177               unsigned MaxSize = maxSizeForAddrSpace(ST,
1178                                                      PtrTy.getAddressSpace(),
1179                                                      Op == G_LOAD);
1180 
1181               // FIXME: Handle widened to power of 2 results better. This ends
1182               // up scalarizing.
1183               // FIXME: 3 element stores scalarized on SI
1184 
1185               // Split if it's too large for the address space.
1186               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1187                 unsigned NumElts = DstTy.getNumElements();
1188                 unsigned EltSize = EltTy.getSizeInBits();
1189 
1190                 if (MaxSize % EltSize == 0) {
1191                   return std::make_pair(
1192                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1193                 }
1194 
1195                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1196 
1197                 // FIXME: Refine when odd breakdowns handled
1198                 // The scalars will need to be re-legalized.
1199                 if (NumPieces == 1 || NumPieces >= NumElts ||
1200                     NumElts % NumPieces != 0)
1201                   return std::make_pair(0, EltTy);
1202 
1203                 return std::make_pair(0,
1204                                       LLT::vector(NumElts / NumPieces, EltTy));
1205               }
1206 
1207               // FIXME: We could probably handle weird extending loads better.
1208               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1209               if (DstTy.getSizeInBits() > MemSize)
1210                 return std::make_pair(0, EltTy);
1211 
1212               unsigned EltSize = EltTy.getSizeInBits();
1213               unsigned DstSize = DstTy.getSizeInBits();
1214               if (!isPowerOf2_32(DstSize)) {
1215                 // We're probably decomposing an odd sized store. Try to split
1216                 // to the widest type. TODO: Account for alignment. As-is it
1217                 // should be OK, since the new parts will be further legalized.
1218                 unsigned FloorSize = PowerOf2Floor(DstSize);
1219                 return std::make_pair(
1220                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1221               }
1222 
1223               // Need to split because of alignment.
1224               unsigned Align = Query.MMODescrs[0].AlignInBits;
1225               if (EltSize > Align &&
1226                   (EltSize / Align < DstTy.getNumElements())) {
1227                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1228               }
1229 
1230               // May need relegalization for the scalars.
1231               return std::make_pair(0, EltTy);
1232             })
1233     .lowerIfMemSizeNotPow2()
1234     .minScalar(0, S32);
1235 
1236     if (IsStore)
1237       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1238 
1239     Actions
1240         .widenScalarToNextPow2(0)
1241         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1242         .lower();
1243   }
1244 
1245   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1246                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1247                                                   {S32, GlobalPtr, 16, 2 * 8},
1248                                                   {S32, LocalPtr, 8, 8},
1249                                                   {S32, LocalPtr, 16, 16},
1250                                                   {S32, PrivatePtr, 8, 8},
1251                                                   {S32, PrivatePtr, 16, 16},
1252                                                   {S32, ConstantPtr, 8, 8},
1253                                                   {S32, ConstantPtr, 16, 2 * 8}});
1254   if (ST.hasFlatAddressSpace()) {
1255     ExtLoads.legalForTypesWithMemDesc(
1256         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1257   }
1258 
1259   ExtLoads.clampScalar(0, S32, S32)
1260           .widenScalarToNextPow2(0)
1261           .unsupportedIfMemSizeNotPow2()
1262           .lower();
1263 
1264   auto &Atomics = getActionDefinitionsBuilder(
1265     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1266      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1267      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1268      G_ATOMICRMW_UMIN})
1269     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1270                {S64, GlobalPtr}, {S64, LocalPtr},
1271                {S32, RegionPtr}, {S64, RegionPtr}});
1272   if (ST.hasFlatAddressSpace()) {
1273     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1274   }
1275 
1276   if (ST.hasLDSFPAtomics()) {
1277     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1278       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1279   }
1280 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1283   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1284     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1285                 {S32, FlatPtr}, {S64, FlatPtr}})
1286     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1287                {S32, RegionPtr}, {S64, RegionPtr}});
1288   // TODO: Pointer types, any 32-bit or 64-bit vector
1289 
1290   // Condition should be s32 for scalar, s1 for vector.
1291   getActionDefinitionsBuilder(G_SELECT)
1292     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1293           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1294           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1295     .clampScalar(0, S16, S64)
1296     .scalarize(1)
1297     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1298     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1299     .clampMaxNumElements(0, S32, 2)
1300     .clampMaxNumElements(0, LocalPtr, 2)
1301     .clampMaxNumElements(0, PrivatePtr, 2)
1302     .scalarize(0)
1303     .widenScalarToNextPow2(0)
1304     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1305 
1306   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1307   // be more flexible with the shift amount type.
1308   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1309     .legalFor({{S32, S32}, {S64, S32}});
1310   if (ST.has16BitInsts()) {
1311     if (ST.hasVOP3PInsts()) {
1312       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1313             .clampMaxNumElements(0, S16, 2);
1314     } else
1315       Shifts.legalFor({{S16, S16}});
1316 
1317     // TODO: Support 16-bit shift amounts for all types
1318     Shifts.widenScalarIf(
1319       [=](const LegalityQuery &Query) {
1320         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1321         // 32-bit amount.
1322         const LLT ValTy = Query.Types[0];
1323         const LLT AmountTy = Query.Types[1];
1324         return ValTy.getSizeInBits() <= 16 &&
1325                AmountTy.getSizeInBits() < 16;
1326       }, changeTo(1, S16));
1327     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1328     Shifts.clampScalar(1, S32, S32);
1329     Shifts.clampScalar(0, S16, S64);
1330     Shifts.widenScalarToNextPow2(0, 16);
1331 
1332     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1333       .minScalar(0, S16)
1334       .scalarize(0)
1335       .lower();
1336   } else {
1337     // Make sure we legalize the shift amount type first, as the general
1338     // expansion for the shifted type will produce much worse code if it hasn't
1339     // been truncated already.
1340     Shifts.clampScalar(1, S32, S32);
1341     Shifts.clampScalar(0, S32, S64);
1342     Shifts.widenScalarToNextPow2(0, 32);
1343 
1344     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1345       .minScalar(0, S32)
1346       .scalarize(0)
1347       .lower();
1348   }
1349   Shifts.scalarize(0);
1350 
1351   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1352     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1353     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1354     unsigned IdxTypeIdx = 2;
1355 
1356     getActionDefinitionsBuilder(Op)
1357       .customIf([=](const LegalityQuery &Query) {
1358           const LLT EltTy = Query.Types[EltTypeIdx];
1359           const LLT VecTy = Query.Types[VecTypeIdx];
1360           const LLT IdxTy = Query.Types[IdxTypeIdx];
1361           const unsigned EltSize = EltTy.getSizeInBits();
1362           return (EltSize == 32 || EltSize == 64) &&
1363                   VecTy.getSizeInBits() % 32 == 0 &&
1364                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1365                   IdxTy.getSizeInBits() == 32;
1366         })
1367       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1368                  bitcastToVectorElement32(VecTypeIdx))
1369       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1370       .bitcastIf(
1371         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1372         [=](const LegalityQuery &Query) {
1373           // For > 64-bit element types, try to turn this into a 64-bit
1374           // element vector since we may be able to do better indexing
1375           // if this is scalar. If not, fall back to 32.
1376           const LLT EltTy = Query.Types[EltTypeIdx];
1377           const LLT VecTy = Query.Types[VecTypeIdx];
1378           const unsigned DstEltSize = EltTy.getSizeInBits();
1379           const unsigned VecSize = VecTy.getSizeInBits();
1380 
1381           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1382           return std::make_pair(
1383             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1384         })
1385       .clampScalar(EltTypeIdx, S32, S64)
1386       .clampScalar(VecTypeIdx, S32, S64)
1387       .clampScalar(IdxTypeIdx, S32, S32)
1388       .clampMaxNumElements(VecTypeIdx, S32, 32)
1389       // TODO: Clamp elements for 64-bit vectors?
1390       // It should only be necessary with variable indexes.
1391       // As a last resort, lower to the stack
1392       .lower();
1393   }
1394 
1395   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1396     .unsupportedIf([=](const LegalityQuery &Query) {
1397         const LLT &EltTy = Query.Types[1].getElementType();
1398         return Query.Types[0] != EltTy;
1399       });
1400 
1401   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1402     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1403     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1404 
1405     // FIXME: Doesn't handle extract of illegal sizes.
1406     getActionDefinitionsBuilder(Op)
1407       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1408       // FIXME: Multiples of 16 should not be legal.
1409       .legalIf([=](const LegalityQuery &Query) {
1410           const LLT BigTy = Query.Types[BigTyIdx];
1411           const LLT LitTy = Query.Types[LitTyIdx];
1412           return (BigTy.getSizeInBits() % 32 == 0) &&
1413                  (LitTy.getSizeInBits() % 16 == 0);
1414         })
1415       .widenScalarIf(
1416         [=](const LegalityQuery &Query) {
1417           const LLT BigTy = Query.Types[BigTyIdx];
1418           return (BigTy.getScalarSizeInBits() < 16);
1419         },
1420         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1421       .widenScalarIf(
1422         [=](const LegalityQuery &Query) {
1423           const LLT LitTy = Query.Types[LitTyIdx];
1424           return (LitTy.getScalarSizeInBits() < 16);
1425         },
1426         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1427       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1428       .widenScalarToNextPow2(BigTyIdx, 32);
1429 
1430   }
1431 
1432   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1433     .legalForCartesianProduct(AllS32Vectors, {S32})
1434     .legalForCartesianProduct(AllS64Vectors, {S64})
1435     .clampNumElements(0, V16S32, V32S32)
1436     .clampNumElements(0, V2S64, V16S64)
1437     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1438 
1439   if (ST.hasScalarPackInsts()) {
1440     BuildVector
1441       // FIXME: Should probably widen s1 vectors straight to s32
1442       .minScalarOrElt(0, S16)
1443       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1444       .minScalar(1, S32);
1445 
1446     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1447       .legalFor({V2S16, S32})
1448       .lower();
1449     BuildVector.minScalarOrElt(0, S32);
1450   } else {
1451     BuildVector.customFor({V2S16, S16});
1452     BuildVector.minScalarOrElt(0, S32);
1453 
1454     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1455       .customFor({V2S16, S32})
1456       .lower();
1457   }
1458 
1459   BuildVector.legalIf(isRegisterType(0));
1460 
1461   // FIXME: Clamp maximum size
1462   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1463     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1464     .clampMaxNumElements(0, S32, 32)
1465     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1466     .clampMaxNumElements(0, S16, 64);
1467 
1468   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1469   // pre-legalize.
1470   if (ST.hasVOP3PInsts()) {
1471     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1472       .customFor({V2S16, V2S16})
1473       .lower();
1474   } else
1475     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1476 
1477   // Merge/Unmerge
1478   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1479     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1480     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1481 
1482     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1483       const LLT Ty = Query.Types[TypeIdx];
1484       if (Ty.isVector()) {
1485         const LLT &EltTy = Ty.getElementType();
1486         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1487           return true;
1488         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1489           return true;
1490       }
1491       return false;
1492     };
1493 
1494     auto &Builder = getActionDefinitionsBuilder(Op)
1495       .lowerFor({{S16, V2S16}})
1496       .lowerIf([=](const LegalityQuery &Query) {
1497           const LLT BigTy = Query.Types[BigTyIdx];
1498           return BigTy.getSizeInBits() == 32;
1499         })
1500       // Try to widen to s16 first for small types.
1501       // TODO: Only do this on targets with legal s16 shifts
1502       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1503       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1504       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1505       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1506                            elementTypeIs(1, S16)),
1507                        changeTo(1, V2S16))
1508       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1509       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1510       // valid.
1511       .clampScalar(LitTyIdx, S32, S512)
1512       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1513       // Break up vectors with weird elements into scalars
1514       .fewerElementsIf(
1515         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1516         scalarize(0))
1517       .fewerElementsIf(
1518         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1519         scalarize(1))
1520       .clampScalar(BigTyIdx, S32, MaxScalar);
1521 
1522     if (Op == G_MERGE_VALUES) {
1523       Builder.widenScalarIf(
1524         // TODO: Use 16-bit shifts if legal for 8-bit values?
1525         [=](const LegalityQuery &Query) {
1526           const LLT Ty = Query.Types[LitTyIdx];
1527           return Ty.getSizeInBits() < 32;
1528         },
1529         changeTo(LitTyIdx, S32));
1530     }
1531 
1532     Builder.widenScalarIf(
1533       [=](const LegalityQuery &Query) {
1534         const LLT Ty = Query.Types[BigTyIdx];
1535         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1536           Ty.getSizeInBits() % 16 != 0;
1537       },
1538       [=](const LegalityQuery &Query) {
1539         // Pick the next power of 2, or for sizes over 128 bits a multiple of 64,
1540         // whichever is smaller.
1541         const LLT &Ty = Query.Types[BigTyIdx];
1542         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1543         if (NewSizeInBits >= 256) {
1544           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1545           if (RoundedTo < NewSizeInBits)
1546             NewSizeInBits = RoundedTo;
1547         }
1548         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1549       })
1550       .legalIf([=](const LegalityQuery &Query) {
1551           const LLT &BigTy = Query.Types[BigTyIdx];
1552           const LLT &LitTy = Query.Types[LitTyIdx];
1553 
1554           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1555             return false;
1556           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1557             return false;
1558 
1559           return BigTy.getSizeInBits() % 16 == 0 &&
1560                  LitTy.getSizeInBits() % 16 == 0 &&
1561                  BigTy.getSizeInBits() <= MaxRegisterSize;
1562         })
1563       // Any vectors left are the wrong size. Scalarize them.
1564       .scalarize(0)
1565       .scalarize(1);
1566   }
1567 
1568   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1569   // RegBankSelect.
1570   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1571     .legalFor({{S32}, {S64}});
1572 
1573   if (ST.hasVOP3PInsts()) {
1574     SextInReg.lowerFor({{V2S16}})
1575       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1576       // get more vector shift opportunities, since we'll get those when
1577       // expanded.
1578       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1579   } else if (ST.has16BitInsts()) {
1580     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1581   } else {
1582     // Prefer to promote to s32 before lowering if we don't have 16-bit
1583     // shifts. This avoids a lot of intermediate truncate and extend operations.
1584     SextInReg.lowerFor({{S32}, {S64}});
1585   }
1586 
1587   SextInReg
1588     .scalarize(0)
1589     .clampScalar(0, S32, S64)
1590     .lower();
1591 
1592   getActionDefinitionsBuilder(G_FSHR)
1593     .legalFor({{S32, S32}})
1594     .scalarize(0)
1595     .lower();
1596 
1597   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1598     .legalFor({S64});
1599 
1600   getActionDefinitionsBuilder(G_FENCE)
1601     .alwaysLegal();
1602 
1603   getActionDefinitionsBuilder({
1604       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1605       G_FCOPYSIGN,
1606 
1607       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1608       G_ATOMICRMW_NAND,
1609       G_ATOMICRMW_FSUB,
1610       G_READ_REGISTER,
1611       G_WRITE_REGISTER,
1612 
1613       G_SADDO, G_SSUBO,
1614 
1615       // TODO: Implement
1616       G_FMINIMUM, G_FMAXIMUM,
1617       G_FSHL
1618     }).lower();
1619 
1620   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1621         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1622         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1623     .unsupported();
1624 
1625   computeTables();
1626   verify(*ST.getInstrInfo());
1627 }
1628 
1629 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1630                                          MachineInstr &MI) const {
1631   MachineIRBuilder &B = Helper.MIRBuilder;
1632   MachineRegisterInfo &MRI = *B.getMRI();
1633 
1634   switch (MI.getOpcode()) {
1635   case TargetOpcode::G_ADDRSPACE_CAST:
1636     return legalizeAddrSpaceCast(MI, MRI, B);
1637   case TargetOpcode::G_FRINT:
1638     return legalizeFrint(MI, MRI, B);
1639   case TargetOpcode::G_FCEIL:
1640     return legalizeFceil(MI, MRI, B);
1641   case TargetOpcode::G_FREM:
1642     return legalizeFrem(MI, MRI, B);
1643   case TargetOpcode::G_INTRINSIC_TRUNC:
1644     return legalizeIntrinsicTrunc(MI, MRI, B);
1645   case TargetOpcode::G_SITOFP:
1646     return legalizeITOFP(MI, MRI, B, true);
1647   case TargetOpcode::G_UITOFP:
1648     return legalizeITOFP(MI, MRI, B, false);
1649   case TargetOpcode::G_FPTOSI:
1650     return legalizeFPTOI(MI, MRI, B, true);
1651   case TargetOpcode::G_FPTOUI:
1652     return legalizeFPTOI(MI, MRI, B, false);
1653   case TargetOpcode::G_FMINNUM:
1654   case TargetOpcode::G_FMAXNUM:
1655   case TargetOpcode::G_FMINNUM_IEEE:
1656   case TargetOpcode::G_FMAXNUM_IEEE:
1657     return legalizeMinNumMaxNum(Helper, MI);
1658   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1659     return legalizeExtractVectorElt(MI, MRI, B);
1660   case TargetOpcode::G_INSERT_VECTOR_ELT:
1661     return legalizeInsertVectorElt(MI, MRI, B);
1662   case TargetOpcode::G_SHUFFLE_VECTOR:
1663     return legalizeShuffleVector(MI, MRI, B);
1664   case TargetOpcode::G_FSIN:
1665   case TargetOpcode::G_FCOS:
1666     return legalizeSinCos(MI, MRI, B);
1667   case TargetOpcode::G_GLOBAL_VALUE:
1668     return legalizeGlobalValue(MI, MRI, B);
1669   case TargetOpcode::G_LOAD:
1670     return legalizeLoad(Helper, MI);
1671   case TargetOpcode::G_FMAD:
1672     return legalizeFMad(MI, MRI, B);
1673   case TargetOpcode::G_FDIV:
1674     return legalizeFDIV(MI, MRI, B);
1675   case TargetOpcode::G_UDIV:
1676   case TargetOpcode::G_UREM:
1677     return legalizeUDIV_UREM(MI, MRI, B);
1678   case TargetOpcode::G_SDIV:
1679   case TargetOpcode::G_SREM:
1680     return legalizeSDIV_SREM(MI, MRI, B);
1681   case TargetOpcode::G_ATOMIC_CMPXCHG:
1682     return legalizeAtomicCmpXChg(MI, MRI, B);
1683   case TargetOpcode::G_FLOG:
1684     return legalizeFlog(MI, B, numbers::ln2f);
1685   case TargetOpcode::G_FLOG10:
1686     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1687   case TargetOpcode::G_FEXP:
1688     return legalizeFExp(MI, B);
1689   case TargetOpcode::G_FPOW:
1690     return legalizeFPow(MI, B);
1691   case TargetOpcode::G_FFLOOR:
1692     return legalizeFFloor(MI, MRI, B);
1693   case TargetOpcode::G_BUILD_VECTOR:
1694     return legalizeBuildVector(MI, MRI, B);
1695   default:
1696     return false;
1697   }
1698 
1699   llvm_unreachable("expected switch to return");
1700 }
1701 
1702 Register AMDGPULegalizerInfo::getSegmentAperture(
1703   unsigned AS,
1704   MachineRegisterInfo &MRI,
1705   MachineIRBuilder &B) const {
1706   MachineFunction &MF = B.getMF();
1707   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1708   const LLT S32 = LLT::scalar(32);
1709 
1710   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1711 
1712   if (ST.hasApertureRegs()) {
1713     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1714     // getreg.
1715     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1716         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1717         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1718     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1719         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1720         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1721     unsigned Encoding =
1722         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1723         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1724         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1725 
1726     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1727 
1728     B.buildInstr(AMDGPU::S_GETREG_B32)
1729       .addDef(GetReg)
1730       .addImm(Encoding);
1731     MRI.setType(GetReg, S32);
1732 
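    // The getreg result holds the aperture field in its low bits; shift it up
    // by the field width so it can be used directly as the high 32 bits of the
    // 64-bit segment base address.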
1733     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1734     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1735   }
1736 
1737   Register QueuePtr = MRI.createGenericVirtualRegister(
1738     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1739 
1740   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1741     return Register();
1742 
1743   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1744   // private_segment_aperture_base_hi.
1745   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1746 
1747   // TODO: can we be smarter about machine pointer info?
1748   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1749   MachineMemOperand *MMO = MF.getMachineMemOperand(
1750       PtrInfo,
1751       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1752           MachineMemOperand::MOInvariant,
1753       4, commonAlignment(Align(64), StructOffset));
1754 
1755   Register LoadAddr;
1756 
1757   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1758   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1759 }
1760 
1761 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1762   MachineInstr &MI, MachineRegisterInfo &MRI,
1763   MachineIRBuilder &B) const {
1764   MachineFunction &MF = B.getMF();
1765 
1766   const LLT S32 = LLT::scalar(32);
1767   Register Dst = MI.getOperand(0).getReg();
1768   Register Src = MI.getOperand(1).getReg();
1769 
1770   LLT DstTy = MRI.getType(Dst);
1771   LLT SrcTy = MRI.getType(Src);
1772   unsigned DestAS = DstTy.getAddressSpace();
1773   unsigned SrcAS = SrcTy.getAddressSpace();
1774 
1775   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1776   // vector element.
1777   assert(!DstTy.isVector());
1778 
1779   const AMDGPUTargetMachine &TM
1780     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1781 
1782   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1783     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1784     return true;
1785   }
1786 
1787   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1788     // Truncate.
1789     B.buildExtract(Dst, Src, 0);
1790     MI.eraseFromParent();
1791     return true;
1792   }
1793 
1794   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1795     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1796     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1797 
1798     // FIXME: This is a bit ugly because it merges two pointers into another
1799     // pointer. Merge operands are required to be the same type, but creating an
1800     // extra ptrtoint would be kind of pointless.
1801     auto HighAddr = B.buildConstant(
1802       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1803     B.buildMerge(Dst, {Src, HighAddr});
1804     MI.eraseFromParent();
1805     return true;
1806   }
1807 
1808   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1809     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1810            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1811     unsigned NullVal = TM.getNullPointerValue(DestAS);
1812 
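    // Keep only the low 32 bits of the flat address, but make sure the flat
    // null value maps to the segment null value rather than to whatever its
    // low bits happen to be.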
1813     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1814     auto FlatNull = B.buildConstant(SrcTy, 0);
1815 
1816     // Extract low 32-bits of the pointer.
1817     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1818 
1819     auto CmpRes =
1820         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1821     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1822 
1823     MI.eraseFromParent();
1824     return true;
1825   }
1826 
1827   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1828     return false;
1829 
1830   if (!ST.hasFlatAddressSpace())
1831     return false;
1832 
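  // Build the flat pointer from the 32-bit segment offset (low half) and the
  // segment aperture base (high half); segment null must map to the flat null
  // value rather than to an aperture-based address.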
1833   auto SegmentNull =
1834       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1835   auto FlatNull =
1836       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1837 
1838   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1839   if (!ApertureReg.isValid())
1840     return false;
1841 
1842   auto CmpRes =
1843       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1844 
1845   // Coerce the type of the low half of the result so we can use merge_values.
1846   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1847 
1848   // TODO: Should we allow mismatched types but matching sizes in merges to
1849   // avoid the ptrtoint?
1850   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1851   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1852 
1853   MI.eraseFromParent();
1854   return true;
1855 }
1856 
1857 bool AMDGPULegalizerInfo::legalizeFrint(
1858   MachineInstr &MI, MachineRegisterInfo &MRI,
1859   MachineIRBuilder &B) const {
1860   Register Src = MI.getOperand(1).getReg();
1861   LLT Ty = MRI.getType(Src);
1862   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1863 
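  // Adding 2^52 with the sign of the input forces the fractional bits to be
  // rounded away under the default round-to-nearest-even mode; subtracting it
  // again leaves the rounded value. C2 is the largest f64 below 2^52: inputs
  // with a larger magnitude are already integral and are returned unchanged.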
1864   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1865   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1866 
1867   auto C1 = B.buildFConstant(Ty, C1Val);
1868   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1869 
1870   // TODO: Should this propagate fast-math-flags?
1871   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1872   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1873 
1874   auto C2 = B.buildFConstant(Ty, C2Val);
1875   auto Fabs = B.buildFAbs(Ty, Src);
1876 
1877   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1878   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1879   MI.eraseFromParent();
1880   return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizeFceil(
1884   MachineInstr &MI, MachineRegisterInfo &MRI,
1885   MachineIRBuilder &B) const {
1886 
1887   const LLT S1 = LLT::scalar(1);
1888   const LLT S64 = LLT::scalar(64);
1889 
1890   Register Src = MI.getOperand(1).getReg();
1891   assert(MRI.getType(Src) == S64);
1892 
1893   // result = trunc(src)
1894   // if (src > 0.0 && src != result)
1895   //   result += 1.0
1896 
1897   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1898 
1899   const auto Zero = B.buildFConstant(S64, 0.0);
1900   const auto One = B.buildFConstant(S64, 1.0);
1901   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1902   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1903   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1904   auto Add = B.buildSelect(S64, And, One, Zero);
1905 
1906   // TODO: Should this propagate fast-math-flags?
1907   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1908   return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeFrem(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
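  // frem(x, y) is expanded as x - trunc(x / y) * y, with the multiply and
  // subtract fused into a single FMA: fma(-trunc(x / y), y, x).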
1914   Register DstReg = MI.getOperand(0).getReg();
1915   Register Src0Reg = MI.getOperand(1).getReg();
1916   Register Src1Reg = MI.getOperand(2).getReg();
1917   auto Flags = MI.getFlags();
1918   LLT Ty = MRI.getType(DstReg);
1919
1920   auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1921   auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1922   auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1923   B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1924   MI.eraseFromParent();
1925   return true;
1926 }
1927 
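// Extract the 11-bit biased exponent from the high 32 bits of an f64 and
// subtract the exponent bias (1023).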
1928 static MachineInstrBuilder extractF64Exponent(Register Hi,
1929                                               MachineIRBuilder &B) {
1930   const unsigned FractBits = 52;
1931   const unsigned ExpBits = 11;
1932   LLT S32 = LLT::scalar(32);
1933 
1934   auto Const0 = B.buildConstant(S32, FractBits - 32);
1935   auto Const1 = B.buildConstant(S32, ExpBits);
1936 
1937   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1938     .addUse(Hi)
1939     .addUse(Const0.getReg(0))
1940     .addUse(Const1.getReg(0));
1941 
1942   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1946   MachineInstr &MI, MachineRegisterInfo &MRI,
1947   MachineIRBuilder &B) const {
1948   const LLT S1 = LLT::scalar(1);
1949   const LLT S32 = LLT::scalar(32);
1950   const LLT S64 = LLT::scalar(64);
1951 
1952   Register Src = MI.getOperand(1).getReg();
1953   assert(MRI.getType(Src) == S64);
1954 
1955   // TODO: Should this use extract since the low half is unused?
1956   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1957   Register Hi = Unmerge.getReg(1);
1958 
1959   // Extract the upper half, since this is where we will find the sign and
1960   // exponent.
1961   auto Exp = extractF64Exponent(Hi, B);
1962 
1963   const unsigned FractBits = 52;
1964 
1965   // Extract the sign bit.
1966   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1967   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1968 
1969   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1970 
1971   const auto Zero32 = B.buildConstant(S32, 0);
1972 
1973   // Extend back to 64-bits.
1974   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1975 
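  // For an unbiased exponent E in [0, 51], the low 52 - E mantissa bits are
  // fractional. Shift the fraction mask right by E and clear those bits from
  // the source to truncate toward zero; the selects below handle exponents
  // outside this range (|x| < 1 and values that are already integral).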
1976   auto Shr = B.buildAShr(S64, FractMask, Exp);
1977   auto Not = B.buildNot(S64, Shr);
1978   auto Tmp0 = B.buildAnd(S64, Src, Not);
1979   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1980 
1981   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1982   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1983 
1984   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1985   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1986   MI.eraseFromParent();
1987   return true;
1988 }
1989 
1990 bool AMDGPULegalizerInfo::legalizeITOFP(
1991   MachineInstr &MI, MachineRegisterInfo &MRI,
1992   MachineIRBuilder &B, bool Signed) const {
1993 
1994   Register Dst = MI.getOperand(0).getReg();
1995   Register Src = MI.getOperand(1).getReg();
1996 
1997   const LLT S64 = LLT::scalar(64);
1998   const LLT S32 = LLT::scalar(32);
1999 
2000   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2001 
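  // Convert each 32-bit half separately: only the high half carries the sign,
  // so it uses the signed or unsigned conversion, and the result is formed as
  // hi * 2^32 + lo with the scaling applied via ldexp.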
2002   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2003 
2004   auto CvtHi = Signed ?
2005     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2006     B.buildUITOFP(S64, Unmerge.getReg(1));
2007 
2008   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2009 
2010   auto ThirtyTwo = B.buildConstant(S32, 32);
2011   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2012     .addUse(CvtHi.getReg(0))
2013     .addUse(ThirtyTwo.getReg(0));
2014 
2015   // TODO: Should this propagate fast-math-flags?
2016   B.buildFAdd(Dst, LdExp, CvtLo);
2017   MI.eraseFromParent();
2018   return true;
2019 }
2020 
2021 // TODO: Copied from DAG implementation. Verify logic and document how this
2022 // actually works.
2023 bool AMDGPULegalizerInfo::legalizeFPTOI(
2024   MachineInstr &MI, MachineRegisterInfo &MRI,
2025   MachineIRBuilder &B, bool Signed) const {
2026 
2027   Register Dst = MI.getOperand(0).getReg();
2028   Register Src = MI.getOperand(1).getReg();
2029 
2030   const LLT S64 = LLT::scalar(64);
2031   const LLT S32 = LLT::scalar(32);
2032 
2033   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2034 
2035   unsigned Flags = MI.getFlags();
2036 
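  // Split the truncated value into 32-bit halves while still in floating
  // point: K0 = 2^-32 scales the value so that its floor is the high word, and
  // K1 = -2^32 (applied via fma) removes that contribution, leaving the low
  // word.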
2037   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2038   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2039   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2040 
2041   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2042   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2043   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2044 
2045   auto Hi = Signed ?
2046     B.buildFPTOSI(S32, FloorMul) :
2047     B.buildFPTOUI(S32, FloorMul);
2048   auto Lo = B.buildFPTOUI(S32, Fma);
2049 
2050   B.buildMerge(Dst, { Lo, Hi });
2051   MI.eraseFromParent();
2052 
2053   return true;
2054 }
2055 
2056 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2057                                                MachineInstr &MI) const {
2058   MachineFunction &MF = Helper.MIRBuilder.getMF();
2059   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2060 
2061   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2062                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2063 
2064   // With ieee_mode disabled, the instructions have the correct behavior
2065   // already for G_FMINNUM/G_FMAXNUM
2066   if (!MFI->getMode().IEEE)
2067     return !IsIEEEOp;
2068 
2069   if (IsIEEEOp)
2070     return true;
2071 
2072   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2073 }
2074 
2075 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2076   MachineInstr &MI, MachineRegisterInfo &MRI,
2077   MachineIRBuilder &B) const {
2078   // TODO: Should move some of this into LegalizerHelper.
2079 
2080   // TODO: Promote dynamic indexing of s16 to s32
2081 
2082   // FIXME: Artifact combiner probably should have replaced the truncated
2083   // constant before this, so we shouldn't need
2084   // getConstantVRegValWithLookThrough.
2085   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2086     MI.getOperand(2).getReg(), MRI);
2087   if (!IdxVal) // Dynamic case will be selected to register indexing.
2088     return true;
2089 
2090   Register Dst = MI.getOperand(0).getReg();
2091   Register Vec = MI.getOperand(1).getReg();
2092 
2093   LLT VecTy = MRI.getType(Vec);
2094   LLT EltTy = VecTy.getElementType();
2095   assert(EltTy == MRI.getType(Dst));
2096 
2097   if (IdxVal->Value < VecTy.getNumElements())
2098     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2099   else
2100     B.buildUndef(Dst);
2101 
2102   MI.eraseFromParent();
2103   return true;
2104 }
2105 
2106 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2107   MachineInstr &MI, MachineRegisterInfo &MRI,
2108   MachineIRBuilder &B) const {
2109   // TODO: Should move some of this into LegalizerHelper.
2110 
2111   // TODO: Promote dynamic indexing of s16 to s32
2112 
2113   // FIXME: Artifact combiner probably should have replaced the truncated
2114   // constant before this, so we shouldn't need
2115   // getConstantVRegValWithLookThrough.
2116   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2117     MI.getOperand(3).getReg(), MRI);
2118   if (!IdxVal) // Dynamic case will be selected to register indexing.
2119     return true;
2120 
2121   Register Dst = MI.getOperand(0).getReg();
2122   Register Vec = MI.getOperand(1).getReg();
2123   Register Ins = MI.getOperand(2).getReg();
2124 
2125   LLT VecTy = MRI.getType(Vec);
2126   LLT EltTy = VecTy.getElementType();
2127   assert(EltTy == MRI.getType(Ins));
2128 
2129   if (IdxVal->Value < VecTy.getNumElements())
2130     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2131   else
2132     B.buildUndef(Dst);
2133 
2134   MI.eraseFromParent();
2135   return true;
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2139   MachineInstr &MI, MachineRegisterInfo &MRI,
2140   MachineIRBuilder &B) const {
2141   const LLT V2S16 = LLT::vector(2, 16);
2142 
2143   Register Dst = MI.getOperand(0).getReg();
2144   Register Src0 = MI.getOperand(1).getReg();
2145   LLT DstTy = MRI.getType(Dst);
2146   LLT SrcTy = MRI.getType(Src0);
2147 
2148   if (SrcTy == V2S16 && DstTy == V2S16 &&
2149       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2150     return true;
2151 
2152   MachineIRBuilder HelperBuilder(MI);
2153   GISelObserverWrapper DummyObserver;
2154   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2155   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::legalizeSinCos(
2159   MachineInstr &MI, MachineRegisterInfo &MRI,
2160   MachineIRBuilder &B) const {
2161 
2162   Register DstReg = MI.getOperand(0).getReg();
2163   Register SrcReg = MI.getOperand(1).getReg();
2164   LLT Ty = MRI.getType(DstReg);
2165   unsigned Flags = MI.getFlags();
2166 
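  // The hardware sin/cos take their input scaled by 1/(2*pi) (i.e. in
  // revolutions rather than radians); targets with a reduced valid input range
  // additionally need the argument wrapped into [0, 1) with fract.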
2167   Register TrigVal;
2168   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2169   if (ST.hasTrigReducedRange()) {
2170     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2171     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2172       .addUse(MulVal.getReg(0))
2173       .setMIFlags(Flags).getReg(0);
2174   } else
2175     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2176 
2177   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2178     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2179   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2180     .addUse(TrigVal)
2181     .setMIFlags(Flags);
2182   MI.eraseFromParent();
2183   return true;
2184 }
2185 
2186 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2187                                                   MachineIRBuilder &B,
2188                                                   const GlobalValue *GV,
2189                                                   int64_t Offset,
2190                                                   unsigned GAFlags) const {
2191   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2192   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2193   // to the following code sequence:
2194   //
2195   // For constant address space:
2196   //   s_getpc_b64 s[0:1]
2197   //   s_add_u32 s0, s0, $symbol
2198   //   s_addc_u32 s1, s1, 0
2199   //
2200   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2201   //   a fixup or relocation is emitted to replace $symbol with a literal
2202   //   constant, which is a pc-relative offset from the encoding of the $symbol
2203   //   operand to the global variable.
2204   //
2205   // For global address space:
2206   //   s_getpc_b64 s[0:1]
2207   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2208   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2209   //
2210   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2211   //   fixups or relocations are emitted to replace $symbol@*@lo and
2212   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2213   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2214   //   operand to the global variable.
2215   //
2216   // What we want here is an offset from the value returned by s_getpc
2217   // (which is the address of the s_add_u32 instruction) to the global
2218   // variable, but since the encoding of $symbol starts 4 bytes after the start
2219   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2220   // small. This requires us to add 4 to the global variable offset in order to
2221   // compute the correct address.
2222 
2223   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2224 
2225   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2226     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2227 
2228   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2229     .addDef(PCReg);
2230 
2231   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2232   if (GAFlags == SIInstrInfo::MO_NONE)
2233     MIB.addImm(0);
2234   else
2235     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2236 
2237   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2238 
2239   if (PtrTy.getSizeInBits() == 32)
2240     B.buildExtract(DstReg, PCReg, 0);
2241   return true;
2242 }
2243 
2244 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2245   MachineInstr &MI, MachineRegisterInfo &MRI,
2246   MachineIRBuilder &B) const {
2247   Register DstReg = MI.getOperand(0).getReg();
2248   LLT Ty = MRI.getType(DstReg);
2249   unsigned AS = Ty.getAddressSpace();
2250 
2251   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2252   MachineFunction &MF = B.getMF();
2253   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2254 
2255   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2256     if (!MFI->isEntryFunction()) {
2257       const Function &Fn = MF.getFunction();
2258       DiagnosticInfoUnsupported BadLDSDecl(
2259         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2260         DS_Warning);
2261       Fn.getContext().diagnose(BadLDSDecl);
2262 
2263       // We currently don't have a way to correctly allocate LDS objects that
2264       // aren't directly associated with a kernel. We do force inlining of
2265       // functions that use local objects. However, if these dead functions are
2266       // not eliminated, we don't want a compile time error. Just emit a warning
2267       // and a trap, since there should be no callable path here.
2268       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2269       B.buildUndef(DstReg);
2270       MI.eraseFromParent();
2271       return true;
2272     }
2273 
2274     // TODO: We could emit code to handle the initialization somewhere.
2275     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2276       const SITargetLowering *TLI = ST.getTargetLowering();
2277       if (!TLI->shouldUseLDSConstAddress(GV)) {
2278         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2279         return true; // Leave in place;
2280       }
2281 
2282       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2283         Type *Ty = GV->getValueType();
2284         // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2285         // zero-sized type in other languages to declare dynamic shared memory
2286         // whose size is not known at compile time. It is allocated by the
2287         // runtime and placed directly after the statically allocated objects.
2288         // All such declarations share the same offset.
2289         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2290           // Adjust alignment for that dynamic shared memory array.
2291           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2292           LLT S32 = LLT::scalar(32);
2293           auto Sz =
2294               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2295           B.buildIntToPtr(DstReg, Sz);
2296           MI.eraseFromParent();
2297           return true;
2298         }
2299       }
2300 
2301       B.buildConstant(
2302           DstReg,
2303           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2304       MI.eraseFromParent();
2305       return true;
2306     }
2307 
2308     const Function &Fn = MF.getFunction();
2309     DiagnosticInfoUnsupported BadInit(
2310       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2311     Fn.getContext().diagnose(BadInit);
2312     return true;
2313   }
2314 
2315   const SITargetLowering *TLI = ST.getTargetLowering();
2316 
2317   if (TLI->shouldEmitFixup(GV)) {
2318     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2319     MI.eraseFromParent();
2320     return true;
2321   }
2322 
2323   if (TLI->shouldEmitPCReloc(GV)) {
2324     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2325     MI.eraseFromParent();
2326     return true;
2327   }
2328 
2329   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2330   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2331 
2332   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2333       MachinePointerInfo::getGOT(MF),
2334       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2335           MachineMemOperand::MOInvariant,
2336       8 /*Size*/, Align(8));
2337 
2338   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2339 
2340   if (Ty.getSizeInBits() == 32) {
2341     // Truncate if this is a 32-bit constant address.
2342     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2343     B.buildExtract(DstReg, Load, 0);
2344   } else
2345     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2346 
2347   MI.eraseFromParent();
2348   return true;
2349 }
2350 
2351 static LLT widenToNextPowerOf2(LLT Ty) {
2352   if (Ty.isVector())
2353     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2354   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2355 }
2356 
2357 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2358                                        MachineInstr &MI) const {
2359   MachineIRBuilder &B = Helper.MIRBuilder;
2360   MachineRegisterInfo &MRI = *B.getMRI();
2361   GISelChangeObserver &Observer = Helper.Observer;
2362 
2363   Register PtrReg = MI.getOperand(1).getReg();
2364   LLT PtrTy = MRI.getType(PtrReg);
2365   unsigned AddrSpace = PtrTy.getAddressSpace();
2366 
2367   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2368     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2369     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2370     Observer.changingInstr(MI);
2371     MI.getOperand(1).setReg(Cast.getReg(0));
2372     Observer.changedInstr(MI);
2373     return true;
2374   }
2375 
2376   Register ValReg = MI.getOperand(0).getReg();
2377   LLT ValTy = MRI.getType(ValReg);
2378 
2379   MachineMemOperand *MMO = *MI.memoperands_begin();
2380   const unsigned ValSize = ValTy.getSizeInBits();
2381   const unsigned MemSize = 8 * MMO->getSize();
2382   const Align MemAlign = MMO->getAlign();
2383   const unsigned AlignInBits = 8 * MemAlign.value();
2384 
2385   // Widen non-power-of-2 loads to the alignment if needed
2386   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2387     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2388 
2389     // This was already the correct extending load result type, so just adjust
2390     // the memory type.
2391     if (WideMemSize == ValSize) {
2392       MachineFunction &MF = B.getMF();
2393 
2394       MachineMemOperand *WideMMO =
2395           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2396       Observer.changingInstr(MI);
2397       MI.setMemRefs(MF, {WideMMO});
2398       Observer.changedInstr(MI);
2399       return true;
2400     }
2401 
2402     // Don't bother handling an edge case that should probably never be produced.
2403     if (ValSize > WideMemSize)
2404       return false;
2405 
2406     LLT WideTy = widenToNextPowerOf2(ValTy);
2407 
2408     Register WideLoad;
2409     if (!WideTy.isVector()) {
2410       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2411       B.buildTrunc(ValReg, WideLoad).getReg(0);
2412     } else {
2413       // Extract the subvector.
2414 
2415       if (isRegisterType(ValTy)) {
2416         // If this a case where G_EXTRACT is legal, use it.
2417         // (e.g. <3 x s32> -> <4 x s32>)
2418         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2419         B.buildExtract(ValReg, WideLoad, 0);
2420       } else {
2421         // For cases where the widened type isn't a nice register value, unmerge
2422         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2423         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2424         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2425         B.setInsertPt(B.getMBB(), MI.getIterator());
2426         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2427       }
2428     }
2429 
2430     MI.eraseFromParent();
2431     return true;
2432   }
2433 
2434   return false;
2435 }
2436 
2437 bool AMDGPULegalizerInfo::legalizeFMad(
2438   MachineInstr &MI, MachineRegisterInfo &MRI,
2439   MachineIRBuilder &B) const {
2440   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2441   assert(Ty.isScalar());
2442 
2443   MachineFunction &MF = B.getMF();
2444   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2445 
2446   // TODO: Always legal with future ftz flag.
2447   // FIXME: Do we need just output?
2448   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2449     return true;
2450   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2451     return true;
2452 
2453   MachineIRBuilder HelperBuilder(MI);
2454   GISelObserverWrapper DummyObserver;
2455   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2456   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2457 }
2458 
2459 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2460   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2461   Register DstReg = MI.getOperand(0).getReg();
2462   Register PtrReg = MI.getOperand(1).getReg();
2463   Register CmpVal = MI.getOperand(2).getReg();
2464   Register NewVal = MI.getOperand(3).getReg();
2465 
2466   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2467          "this should not have been custom lowered");
2468 
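  // The target cmpxchg pseudo takes the new value and the compare value packed
  // together as a two-element vector operand.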
2469   LLT ValTy = MRI.getType(CmpVal);
2470   LLT VecTy = LLT::vector(2, ValTy);
2471 
2472   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2473 
2474   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2475     .addDef(DstReg)
2476     .addUse(PtrReg)
2477     .addUse(PackedVal)
2478     .setMemRefs(MI.memoperands());
2479 
2480   MI.eraseFromParent();
2481   return true;
2482 }
2483 
2484 bool AMDGPULegalizerInfo::legalizeFlog(
2485   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2486   Register Dst = MI.getOperand(0).getReg();
2487   Register Src = MI.getOperand(1).getReg();
2488   LLT Ty = B.getMRI()->getType(Dst);
2489   unsigned Flags = MI.getFlags();
2490 
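  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes the conversion
  // factor (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).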
2491   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2492   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2493 
2494   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2495   MI.eraseFromParent();
2496   return true;
2497 }
2498 
2499 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2500                                        MachineIRBuilder &B) const {
2501   Register Dst = MI.getOperand(0).getReg();
2502   Register Src = MI.getOperand(1).getReg();
2503   unsigned Flags = MI.getFlags();
2504   LLT Ty = B.getMRI()->getType(Dst);
2505 
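  // exp(x) = exp2(x * log2(e)).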
2506   auto K = B.buildFConstant(Ty, numbers::log2e);
2507   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2508   B.buildFExp2(Dst, Mul, Flags);
2509   MI.eraseFromParent();
2510   return true;
2511 }
2512 
2513 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2514                                        MachineIRBuilder &B) const {
2515   Register Dst = MI.getOperand(0).getReg();
2516   Register Src0 = MI.getOperand(1).getReg();
2517   Register Src1 = MI.getOperand(2).getReg();
2518   unsigned Flags = MI.getFlags();
2519   LLT Ty = B.getMRI()->getType(Dst);
2520   const LLT S16 = LLT::scalar(16);
2521   const LLT S32 = LLT::scalar(32);
2522 
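  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses fmul_legacy,
  // presumably for its 0 * x == 0 behavior in the special cases.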
2523   if (Ty == S32) {
2524     auto Log = B.buildFLog2(S32, Src0, Flags);
2525     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2526       .addUse(Log.getReg(0))
2527       .addUse(Src1)
2528       .setMIFlags(Flags);
2529     B.buildFExp2(Dst, Mul, Flags);
2530   } else if (Ty == S16) {
2531     // There's no f16 fmul_legacy, so we need to convert for it.
2532     auto Log = B.buildFLog2(S16, Src0, Flags);
2533     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2534     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2535     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2536       .addUse(Ext0.getReg(0))
2537       .addUse(Ext1.getReg(0))
2538       .setMIFlags(Flags);
2539 
2540     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2541   } else
2542     return false;
2543 
2544   MI.eraseFromParent();
2545   return true;
2546 }
2547 
2548 // Find a source register, ignoring any possible source modifiers.
2549 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2550   Register ModSrc = OrigSrc;
2551   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2552     ModSrc = SrcFNeg->getOperand(1).getReg();
2553     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2554       ModSrc = SrcFAbs->getOperand(1).getReg();
2555   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2556     ModSrc = SrcFAbs->getOperand(1).getReg();
2557   return ModSrc;
2558 }
2559 
2560 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2561                                          MachineRegisterInfo &MRI,
2562                                          MachineIRBuilder &B) const {
2563 
2564   const LLT S1 = LLT::scalar(1);
2565   const LLT S64 = LLT::scalar(64);
2566   Register Dst = MI.getOperand(0).getReg();
2567   Register OrigSrc = MI.getOperand(1).getReg();
2568   unsigned Flags = MI.getFlags();
2569   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2570          "this should not have been custom lowered");
2571 
2572   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2573   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2574   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2575   // V_FRACT bug is:
2576   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2577   //
2578   // Convert floor(x) to (x - fract(x))
2579 
2580   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2581     .addUse(OrigSrc)
2582     .setMIFlags(Flags);
2583 
2584   // Give source modifier matching some assistance before obscuring a foldable
2585   // pattern.
2586 
2587   // TODO: We can avoid the neg on the fract? The input sign to fract
2588   // shouldn't matter?
2589   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2590 
2591   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2592 
2593   Register Min = MRI.createGenericVirtualRegister(S64);
2594 
2595   // We don't need to concern ourselves with the snan handling difference, so
2596   // use the one which will directly select.
2597   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2598   if (MFI->getMode().IEEE)
2599     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2600   else
2601     B.buildFMinNum(Min, Fract, Const, Flags);
2602 
2603   Register CorrectedFract = Min;
2604   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2605     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2606     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2607   }
2608 
2609   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2610   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2611 
2612   MI.eraseFromParent();
2613   return true;
2614 }
2615 
2616 // Turn an illegal packed v2s16 build vector into bit operations.
2617 // TODO: This should probably be a bitcast action in LegalizerHelper.
2618 bool AMDGPULegalizerInfo::legalizeBuildVector(
2619   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2620   Register Dst = MI.getOperand(0).getReg();
2621   const LLT S32 = LLT::scalar(32);
2622   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2623 
2624   Register Src0 = MI.getOperand(1).getReg();
2625   Register Src1 = MI.getOperand(2).getReg();
2626   assert(MRI.getType(Src0) == LLT::scalar(16));
2627 
2628   auto Merge = B.buildMerge(S32, {Src0, Src1});
2629   B.buildBitcast(Dst, Merge);
2630 
2631   MI.eraseFromParent();
2632   return true;
2633 }
2634 
2635 // Return the use branch instruction, otherwise null if the usage is invalid.
2636 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2637                                        MachineRegisterInfo &MRI,
2638                                        MachineInstr *&Br,
2639                                        MachineBasicBlock *&UncondBrTarget) {
2640   Register CondDef = MI.getOperand(0).getReg();
2641   if (!MRI.hasOneNonDBGUse(CondDef))
2642     return nullptr;
2643 
2644   MachineBasicBlock *Parent = MI.getParent();
2645   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2646   if (UseMI.getParent() != Parent ||
2647       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2648     return nullptr;
2649 
2650   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2651   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2652   if (Next == Parent->end()) {
2653     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2654     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2655       return nullptr;
2656     UncondBrTarget = &*NextMBB;
2657   } else {
2658     if (Next->getOpcode() != AMDGPU::G_BR)
2659       return nullptr;
2660     Br = &*Next;
2661     UncondBrTarget = Br->getOperand(0).getMBB();
2662   }
2663 
2664   return &UseMI;
2665 }
2666 
2667 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2668                                          const ArgDescriptor *Arg,
2669                                          const TargetRegisterClass *ArgRC,
2670                                          LLT ArgTy) const {
2671   MCRegister SrcReg = Arg->getRegister();
2672   assert(SrcReg.isPhysical() && "Physical register expected");
2673   assert(DstReg.isVirtual() && "Virtual register expected");
2674 
2675   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2676                                              ArgTy);
2677   if (Arg->isMasked()) {
2678     // TODO: Should we try to emit this once in the entry block?
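    // The value is a bitfield packed into a wider register; shift it down to
    // bit 0 and mask off the field width.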
2679     const LLT S32 = LLT::scalar(32);
2680     const unsigned Mask = Arg->getMask();
2681     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2682 
2683     Register AndMaskSrc = LiveIn;
2684 
2685     if (Shift != 0) {
2686       auto ShiftAmt = B.buildConstant(S32, Shift);
2687       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2688     }
2689 
2690     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2691   } else {
2692     B.buildCopy(DstReg, LiveIn);
2693   }
2694 
2695   return true;
2696 }
2697 
2698 bool AMDGPULegalizerInfo::loadInputValue(
2699     Register DstReg, MachineIRBuilder &B,
2700     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2701   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2702   const ArgDescriptor *Arg;
2703   const TargetRegisterClass *ArgRC;
2704   LLT ArgTy;
2705   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2706 
2707   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2708     return false; // TODO: Handle these
2709   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2710 }
2711 
2712 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2713     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2714     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2715   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2716     return false;
2717 
2718   MI.eraseFromParent();
2719   return true;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2723                                        MachineRegisterInfo &MRI,
2724                                        MachineIRBuilder &B) const {
2725   Register Dst = MI.getOperand(0).getReg();
2726   LLT DstTy = MRI.getType(Dst);
2727   LLT S16 = LLT::scalar(16);
2728   LLT S32 = LLT::scalar(32);
2729   LLT S64 = LLT::scalar(64);
2730 
2731   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2732     return true;
2733 
2734   if (DstTy == S16)
2735     return legalizeFDIV16(MI, MRI, B);
2736   if (DstTy == S32)
2737     return legalizeFDIV32(MI, MRI, B);
2738   if (DstTy == S64)
2739     return legalizeFDIV64(MI, MRI, B);
2740 
2741   return false;
2742 }
2743 
2744 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2745                                                   Register DstReg,
2746                                                   Register X,
2747                                                   Register Y,
2748                                                   bool IsDiv) const {
2749   const LLT S1 = LLT::scalar(1);
2750   const LLT S32 = LLT::scalar(32);
2751 
2752   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2753   // algorithm used here.
2754 
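  // In short: form a 32-bit fixed-point estimate of 2^32 / y from the hardware
  // reciprocal, refine it with one Newton-Raphson step, compute the quotient
  // as umulh(x, z), and then correct the quotient/remainder with up to two
  // conditional adjustments.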
2755   // Initial estimate of inv(y).
2756   auto FloatY = B.buildUITOFP(S32, Y);
2757   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2758   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2759   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2760   auto Z = B.buildFPTOUI(S32, ScaledY);
2761 
2762   // One round of UNR.
2763   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2764   auto NegYZ = B.buildMul(S32, NegY, Z);
2765   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2766 
2767   // Quotient/remainder estimate.
2768   auto Q = B.buildUMulH(S32, X, Z);
2769   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2770 
2771   // First quotient/remainder refinement.
2772   auto One = B.buildConstant(S32, 1);
2773   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2774   if (IsDiv)
2775     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2776   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2777 
2778   // Second quotient/remainder refinement.
2779   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2780   if (IsDiv)
2781     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2782   else
2783     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2784 }
2785 
2786 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2787                                               MachineRegisterInfo &MRI,
2788                                               MachineIRBuilder &B) const {
2789   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2790   Register DstReg = MI.getOperand(0).getReg();
2791   Register Num = MI.getOperand(1).getReg();
2792   Register Den = MI.getOperand(2).getReg();
2793   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2794   MI.eraseFromParent();
2795   return true;
2796 }
2797 
2798 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2799 //
2800 // Return lo, hi of result
2801 //
2802 // %cvt.lo = G_UITOFP Val.lo
2803 // %cvt.hi = G_UITOFP Val.hi
2804 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2805 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2806 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2807 // %mul2 = G_FMUL %mul1, 2**(-32)
2808 // %trunc = G_INTRINSIC_TRUNC %mul2
2809 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2810 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2811 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2812                                                        Register Val) {
2813   const LLT S32 = LLT::scalar(32);
2814   auto Unmerge = B.buildUnmerge(S32, Val);
2815 
2816   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2817   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2818 
2819   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2820                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2821 
2822   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2823   auto Mul1 =
2824       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2825 
2826   // 2**(-32)
2827   auto Mul2 =
2828       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2829   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2830 
2831   // -(2**32)
2832   auto Mad2 = B.buildFMAD(S32, Trunc,
2833                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2834 
2835   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2836   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2837 
2838   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2839 }
2840 
2841 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2842                                                   Register DstReg,
2843                                                   Register Numer,
2844                                                   Register Denom,
2845                                                   bool IsDiv) const {
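  // 64-bit unsigned division/remainder built around a 64-bit fixed-point
  // reciprocal of the denominator (see emitReciprocalU64), refined with two
  // Newton-Raphson-style steps, followed by at most two quotient/remainder
  // corrections.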
2846   const LLT S32 = LLT::scalar(32);
2847   const LLT S64 = LLT::scalar(64);
2848   const LLT S1 = LLT::scalar(1);
2849   Register RcpLo, RcpHi;
2850 
2851   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2852 
2853   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2854 
2855   auto Zero64 = B.buildConstant(S64, 0);
2856   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2857 
2858   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2859   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2860 
2861   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2862   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2863   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2864 
2865   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2866   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2867   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2868   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2869 
2870   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2871   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2872   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2873   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2874   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2875 
2876   auto Zero32 = B.buildConstant(S32, 0);
2877   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2878   auto Add2_HiC =
2879       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2880   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2881   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2882 
2883   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2884   Register NumerLo = UnmergeNumer.getReg(0);
2885   Register NumerHi = UnmergeNumer.getReg(1);
2886 
2887   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2888   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2889   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2890   Register Mul3_Lo = UnmergeMul3.getReg(0);
2891   Register Mul3_Hi = UnmergeMul3.getReg(1);
2892   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2893   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2894   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2895   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2896 
2897   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2898   Register DenomLo = UnmergeDenom.getReg(0);
2899   Register DenomHi = UnmergeDenom.getReg(1);
2900 
2901   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2902   auto C1 = B.buildSExt(S32, CmpHi);
2903 
2904   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2905   auto C2 = B.buildSExt(S32, CmpLo);
2906 
2907   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2908   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2909 
2910   // TODO: Here and below portions of the code can be enclosed into if/endif.
2911   // Currently control flow is unconditional and we have 4 selects after
2912   // potential endif to substitute PHIs.
2913 
2914   // if C3 != 0 ...
2915   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2916   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2917   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2918   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2919 
2920   auto One64 = B.buildConstant(S64, 1);
2921   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2922 
2923   auto C4 =
2924       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2925   auto C5 =
2926       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2927   auto C6 = B.buildSelect(
2928       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2929 
2930   // if (C6 != 0)
2931   auto Add4 = B.buildAdd(S64, Add3, One64);
2932   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2933 
2934   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2935   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2936   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2937 
2938   // endif C6
2939   // endif C3
2940 
2941   if (IsDiv) {
2942     auto Sel1 = B.buildSelect(
2943         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2944     B.buildSelect(DstReg,
2945                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2946   } else {
2947     auto Sel2 = B.buildSelect(
2948         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2949     B.buildSelect(DstReg,
2950                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2951   }
2952 }
2953 
2954 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2955                                             MachineRegisterInfo &MRI,
2956                                             MachineIRBuilder &B) const {
2957   const LLT S64 = LLT::scalar(64);
2958   const LLT S32 = LLT::scalar(32);
2959   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2960   Register DstReg = MI.getOperand(0).getReg();
2961   Register Num = MI.getOperand(1).getReg();
2962   Register Den = MI.getOperand(2).getReg();
2963   LLT Ty = MRI.getType(DstReg);
2964 
2965   if (Ty == S32)
2966     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2967   else if (Ty == S64)
2968     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2969   else
2970     return false;
2971 
2972   MI.eraseFromParent();
2973   return true;
2975 }
2976 
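// Lower signed division/remainder in terms of the unsigned expansion: take the
// absolute value of each operand via (x + sign) ^ sign, run the unsigned
// div/rem on those, and restore the sign afterwards (the quotient takes the
// xor of the operand signs, the remainder takes the sign of the LHS).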
2977 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2978                                             MachineRegisterInfo &MRI,
2979                                             MachineIRBuilder &B) const {
2980   const LLT S64 = LLT::scalar(64);
2981   const LLT S32 = LLT::scalar(32);
2982 
2983   Register DstReg = MI.getOperand(0).getReg();
2984   const LLT Ty = MRI.getType(DstReg);
2985   if (Ty != S32 && Ty != S64)
2986     return false;
2987 
2988   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2989 
2990   Register LHS = MI.getOperand(1).getReg();
2991   Register RHS = MI.getOperand(2).getReg();
2992 
2993   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2994   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2995   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2996 
2997   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2998   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2999 
3000   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3001   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3002 
3003   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3004   if (Ty == S32)
3005     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3006   else
3007     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3008 
3009   Register Sign;
3010   if (IsDiv)
3011     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3012   else
3013     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3014 
3015   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3016   B.buildSub(DstReg, UDivRem, Sign);
3017 
3018   MI.eraseFromParent();
3019   return true;
3020 }
3021 
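// Try to lower fdiv using only a reciprocal when relaxed FP math allows it:
// 1.0 / x and -1.0 / x become a (possibly negated) rcp, and a general x / y
// becomes x * rcp(y). Returns false when the required precision (denormals,
// f64 without unsafe math) rules this out, so a more accurate expansion can be
// used instead.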
3022 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3023                                                  MachineRegisterInfo &MRI,
3024                                                  MachineIRBuilder &B) const {
3025   Register Res = MI.getOperand(0).getReg();
3026   Register LHS = MI.getOperand(1).getReg();
3027   Register RHS = MI.getOperand(2).getReg();
3028 
3029   uint16_t Flags = MI.getFlags();
3030 
3031   LLT ResTy = MRI.getType(Res);
3032   LLT S32 = LLT::scalar(32);
3033   LLT S64 = LLT::scalar(64);
3034 
3035   const MachineFunction &MF = B.getMF();
3036   bool Unsafe =
3037     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3038 
3039   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3040     return false;
3041 
3042   if (!Unsafe && ResTy == S32 &&
3043       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3044     return false;
3045 
3046   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3047     // 1 / x -> RCP(x)
3048     if (CLHS->isExactlyValue(1.0)) {
3049       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3050         .addUse(RHS)
3051         .setMIFlags(Flags);
3052 
3053       MI.eraseFromParent();
3054       return true;
3055     }
3056 
3057     // -1 / x -> RCP( FNEG(x) )
3058     if (CLHS->isExactlyValue(-1.0)) {
3059       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3060       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3061         .addUse(FNeg.getReg(0))
3062         .setMIFlags(Flags);
3063 
3064       MI.eraseFromParent();
3065       return true;
3066     }
3067   }
3068 
3069   // x / y -> x * (1.0 / y)
3070   if (Unsafe) {
3071     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3072       .addUse(RHS)
3073       .setMIFlags(Flags);
3074     B.buildFMul(Res, LHS, RCP, Flags);
3075 
3076     MI.eraseFromParent();
3077     return true;
3078   }
3079 
3080   return false;
3081 }
3082 
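// f16 fdiv expansion: extend both operands to f32, form an approximate
// quotient with rcp and a multiply, truncate back to f16, and let
// amdgcn.div.fixup handle the special cases.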
3083 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3084                                          MachineRegisterInfo &MRI,
3085                                          MachineIRBuilder &B) const {
3086   Register Res = MI.getOperand(0).getReg();
3087   Register LHS = MI.getOperand(1).getReg();
3088   Register RHS = MI.getOperand(2).getReg();
3089 
3090   uint16_t Flags = MI.getFlags();
3091 
3092   LLT S16 = LLT::scalar(16);
3093   LLT S32 = LLT::scalar(32);
3094 
3095   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3096   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3097 
3098   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3099     .addUse(RHSExt.getReg(0))
3100     .setMIFlags(Flags);
3101 
3102   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3103   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3104 
3105   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3106     .addUse(RDst.getReg(0))
3107     .addUse(RHS)
3108     .addUse(LHS)
3109     .setMIFlags(Flags);
3110 
3111   MI.eraseFromParent();
3112   return true;
3113 }
3114 
3115 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3116 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3117 static void toggleSPDenormMode(bool Enable,
3118                                MachineIRBuilder &B,
3119                                const GCNSubtarget &ST,
3120                                AMDGPU::SIModeRegisterDefaults Mode) {
3121   // Set SP denorm mode to this value.
3122   unsigned SPDenormMode =
3123     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3124 
3125   if (ST.hasDenormModeInst()) {
3126     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3127     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3128 
3129     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3130     B.buildInstr(AMDGPU::S_DENORM_MODE)
3131       .addImm(NewDenormModeValue);
3132 
3133   } else {
3134     // Select FP32 bit field in mode register.
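    // This immediate follows the hwreg(id, offset, width) encoding: a 2-bit
    // field (WIDTH_M1 = 1) at offset 4 of the MODE register, which holds the
    // FP32 denorm controls.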
3135     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3136                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3137                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3138 
3139     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3140       .addImm(SPDenormMode)
3141       .addImm(SPDenormModeBitField);
3142   }
3143 }
3144 
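// Full-precision f32 fdiv expansion: scale the operands with amdgcn.div.scale,
// refine an rcp estimate of the scaled denominator through a chain of FMAs,
// then resolve the result with amdgcn.div.fmas and amdgcn.div.fixup. The FMA
// chain depends on denormal arithmetic, so FP32 denorm support is temporarily
// enabled around it when the function's default mode flushes denormals.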
3145 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3146                                          MachineRegisterInfo &MRI,
3147                                          MachineIRBuilder &B) const {
3148   Register Res = MI.getOperand(0).getReg();
3149   Register LHS = MI.getOperand(1).getReg();
3150   Register RHS = MI.getOperand(2).getReg();
3151   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3152   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3153 
3154   uint16_t Flags = MI.getFlags();
3155 
3156   LLT S32 = LLT::scalar(32);
3157   LLT S1 = LLT::scalar(1);
3158 
3159   auto One = B.buildFConstant(S32, 1.0f);
3160 
3161   auto DenominatorScaled =
3162     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3163       .addUse(LHS)
3164       .addUse(RHS)
3165       .addImm(0)
3166       .setMIFlags(Flags);
3167   auto NumeratorScaled =
3168     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3169       .addUse(LHS)
3170       .addUse(RHS)
3171       .addImm(1)
3172       .setMIFlags(Flags);
3173 
3174   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3175     .addUse(DenominatorScaled.getReg(0))
3176     .setMIFlags(Flags);
3177   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3178 
3179   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3180   // aren't modeled as reading it.
3181   if (!Mode.allFP32Denormals())
3182     toggleSPDenormMode(true, B, ST, Mode);
3183 
3184   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3185   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3186   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3187   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3188   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3189   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3190 
3191   if (!Mode.allFP32Denormals())
3192     toggleSPDenormMode(false, B, ST, Mode);
3193 
3194   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3195     .addUse(Fma4.getReg(0))
3196     .addUse(Fma1.getReg(0))
3197     .addUse(Fma3.getReg(0))
3198     .addUse(NumeratorScaled.getReg(1))
3199     .setMIFlags(Flags);
3200 
3201   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3202     .addUse(Fmas.getReg(0))
3203     .addUse(RHS)
3204     .addUse(LHS)
3205     .setMIFlags(Flags);
3206 
3207   MI.eraseFromParent();
3208   return true;
3209 }
3210 
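// Full-precision f64 fdiv expansion, structured like the f32 case: div.scale,
// an rcp estimate refined by FMAs, then div.fmas and div.fixup. No denorm mode
// toggling is done here, and subtargets with the broken div.scale condition
// output get the workaround below.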
3211 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3212                                          MachineRegisterInfo &MRI,
3213                                          MachineIRBuilder &B) const {
3214   Register Res = MI.getOperand(0).getReg();
3215   Register LHS = MI.getOperand(1).getReg();
3216   Register RHS = MI.getOperand(2).getReg();
3217 
3218   uint16_t Flags = MI.getFlags();
3219 
3220   LLT S64 = LLT::scalar(64);
3221   LLT S1 = LLT::scalar(1);
3222 
3223   auto One = B.buildFConstant(S64, 1.0);
3224 
3225   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3226     .addUse(LHS)
3227     .addUse(RHS)
3228     .addImm(0)
3229     .setMIFlags(Flags);
3230 
3231   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3232 
3233   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3234     .addUse(DivScale0.getReg(0))
3235     .setMIFlags(Flags);
3236 
3237   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3238   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3239   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3240 
3241   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3242     .addUse(LHS)
3243     .addUse(RHS)
3244     .addImm(1)
3245     .setMIFlags(Flags);
3246 
3247   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3248   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3249   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3250 
3251   Register Scale;
3252   if (!ST.hasUsableDivScaleConditionOutput()) {
3253     // Workaround a hardware bug on SI where the condition output from div_scale
3254     // is not usable.
3255 
3256     LLT S32 = LLT::scalar(32);
3257 
3258     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3259     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3260     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3261     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3262 
3263     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3264                               Scale1Unmerge.getReg(1));
3265     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3266                               Scale0Unmerge.getReg(1));
3267     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3268   } else {
3269     Scale = DivScale1.getReg(1);
3270   }
3271 
3272   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3273     .addUse(Fma4.getReg(0))
3274     .addUse(Fma3.getReg(0))
3275     .addUse(Mul.getReg(0))
3276     .addUse(Scale)
3277     .setMIFlags(Flags);
3278 
3279   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3280     .addUse(Fmas.getReg(0))
3281     .addUse(RHS)
3282     .addUse(LHS)
3283     .setMIFlags(Flags);
3284 
3285   MI.eraseFromParent();
3286   return true;
3287 }
3288 
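// Lower the fdiv.fast expansion: when |denominator| is very large (above the
// f32 bit pattern 0x6f800000, roughly 2**96), pre-scale it by 2**-32 so the
// rcp stays in range, then apply the same scale to the final product to
// compensate.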
3289 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3290                                                  MachineRegisterInfo &MRI,
3291                                                  MachineIRBuilder &B) const {
3292   Register Res = MI.getOperand(0).getReg();
3293   Register LHS = MI.getOperand(2).getReg();
3294   Register RHS = MI.getOperand(3).getReg();
3295   uint16_t Flags = MI.getFlags();
3296 
3297   LLT S32 = LLT::scalar(32);
3298   LLT S1 = LLT::scalar(1);
3299 
3300   auto Abs = B.buildFAbs(S32, RHS, Flags);
3301   const APFloat C0Val(1.0f);
3302 
3303   auto C0 = B.buildConstant(S32, 0x6f800000);
3304   auto C1 = B.buildConstant(S32, 0x2f800000);
3305   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3306 
3307   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3308   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3309 
3310   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3311 
3312   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3313     .addUse(Mul0.getReg(0))
3314     .setMIFlags(Flags);
3315 
3316   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3317 
3318   B.buildFMul(Res, Sel, Mul1, Flags);
3319 
3320   MI.eraseFromParent();
3321   return true;
3322 }
3323 
3324 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3325 // FIXME: Why do we handle this one but not other removed instructions?
3326 //
3327 // Reciprocal square root.  The clamp prevents infinite results, clamping
3328 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3329 // +-max_float.
3330 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3331                                                     MachineRegisterInfo &MRI,
3332                                                     MachineIRBuilder &B) const {
3333   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3334     return true;
3335 
3336   Register Dst = MI.getOperand(0).getReg();
3337   Register Src = MI.getOperand(2).getReg();
3338   auto Flags = MI.getFlags();
3339 
3340   LLT Ty = MRI.getType(Dst);
3341 
3342   const fltSemantics *FltSemantics;
3343   if (Ty == LLT::scalar(32))
3344     FltSemantics = &APFloat::IEEEsingle();
3345   else if (Ty == LLT::scalar(64))
3346     FltSemantics = &APFloat::IEEEdouble();
3347   else
3348     return false;
3349 
3350   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3351     .addUse(Src)
3352     .setMIFlags(Flags);
3353 
3354   // We don't need to concern ourselves with the snan handling difference, since the
3355   // rsq result is already quieted (or not); use the variant which will directly select.
3356   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3357   const bool UseIEEE = MFI->getMode().IEEE;
3358 
3359   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3360   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3361                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3362 
3363   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3364 
3365   if (UseIEEE)
3366     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3367   else
3368     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3369   MI.eraseFromParent();
3370   return true;
3371 }
3372 
3373 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3374   switch (IID) {
3375   case Intrinsic::amdgcn_ds_fadd:
3376     return AMDGPU::G_ATOMICRMW_FADD;
3377   case Intrinsic::amdgcn_ds_fmin:
3378     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3379   case Intrinsic::amdgcn_ds_fmax:
3380     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3381   default:
3382     llvm_unreachable("not a DS FP intrinsic");
3383   }
3384 }
3385 
3386 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3387                                                       MachineInstr &MI,
3388                                                       Intrinsic::ID IID) const {
3389   GISelChangeObserver &Observer = Helper.Observer;
3390   Observer.changingInstr(MI);
3391 
3392   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3393 
3394   // The remaining operands were used to set fields in the MemOperand on
3395   // construction.
3396   for (int I = 6; I > 3; --I)
3397     MI.RemoveOperand(I);
3398 
3399   MI.RemoveOperand(1); // Remove the intrinsic ID.
3400   Observer.changedInstr(MI);
3401   return true;
3402 }
3403 
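// The implicit argument pointer is the kernarg segment pointer plus the offset
// of the first implicit kernel argument.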
3404 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3405                                             MachineRegisterInfo &MRI,
3406                                             MachineIRBuilder &B) const {
3407   uint64_t Offset =
3408     ST.getTargetLowering()->getImplicitParameterOffset(
3409       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3410   LLT DstTy = MRI.getType(DstReg);
3411   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3412 
3413   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3414   if (!loadInputValue(KernargPtrReg, B,
3415                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3416     return false;
3417 
3418   // FIXME: This should be nuw
3419   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3420   return true;
3421 }
3422 
3423 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3424                                                  MachineRegisterInfo &MRI,
3425                                                  MachineIRBuilder &B) const {
3426   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3427   if (!MFI->isEntryFunction()) {
3428     return legalizePreloadedArgIntrin(MI, MRI, B,
3429                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3430   }
3431 
3432   Register DstReg = MI.getOperand(0).getReg();
3433   if (!getImplicitArgPtr(DstReg, MRI, B))
3434     return false;
3435 
3436   MI.eraseFromParent();
3437   return true;
3438 }
3439 
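// Check whether a flat pointer points into a given address space (e.g. for the
// is.shared / is.private queries): the pointer is in that address space iff
// its high 32 bits equal the corresponding aperture base.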
3440 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3441                                               MachineRegisterInfo &MRI,
3442                                               MachineIRBuilder &B,
3443                                               unsigned AddrSpace) const {
3444   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3445   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3446   Register Hi32 = Unmerge.getReg(1);
3447 
3448   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3449   MI.eraseFromParent();
3450   return true;
3451 }
3452 
3453 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3454 // offset (the offset that is included in bounds checking and swizzling, to be
3455 // split between the instruction's voffset and immoffset fields) and soffset
3456 // (the offset that is excluded from bounds checking and swizzling, to go in
3457 // the instruction's soffset field).  This function takes the first kind of
3458 // offset and figures out how to split it between voffset and immoffset.
3459 std::tuple<Register, unsigned, unsigned>
3460 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3461                                         Register OrigOffset) const {
3462   const unsigned MaxImm = 4095;
3463   Register BaseReg;
3464   unsigned TotalConstOffset;
3465   MachineInstr *OffsetDef;
3466   const LLT S32 = LLT::scalar(32);
3467 
3468   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3469     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3470 
3471   unsigned ImmOffset = TotalConstOffset;
3472 
3473   // If the immediate value is too big for the immoffset field, put the value
3474   // and -4096 into the immoffset field so that the value that is copied/added
3475   // for the voffset field is a multiple of 4096, and it stands more chance
3476   // of being CSEd with the copy/add for another similar load/store.
3477   // However, do not do that rounding down to a multiple of 4096 if that is a
3478   // negative number, as it appears to be illegal to have a negative offset
3479   // in the vgpr, even if adding the immediate offset makes it positive.
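  //
  // For example, a constant offset of 5000 would be split into a voffset
  // contribution of 4096 and an immoffset of 904.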
3480   unsigned Overflow = ImmOffset & ~MaxImm;
3481   ImmOffset -= Overflow;
3482   if ((int32_t)Overflow < 0) {
3483     Overflow += ImmOffset;
3484     ImmOffset = 0;
3485   }
3486 
3487   if (Overflow != 0) {
3488     if (!BaseReg) {
3489       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3490     } else {
3491       auto OverflowVal = B.buildConstant(S32, Overflow);
3492       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3493     }
3494   }
3495 
3496   if (!BaseReg)
3497     BaseReg = B.buildConstant(S32, 0).getReg(0);
3498 
3499   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3500 }
3501 
3502 /// Handle register layout difference for f16 images for some subtargets.
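/// On subtargets with unpacked D16 VMEM, each 16-bit element occupies the low
/// half of a 32-bit register, so <N x s16> data is widened element-wise to an
/// <N x s32> vector here.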
3503 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3504                                              MachineRegisterInfo &MRI,
3505                                              Register Reg) const {
3506   if (!ST.hasUnpackedD16VMem())
3507     return Reg;
3508 
3509   const LLT S16 = LLT::scalar(16);
3510   const LLT S32 = LLT::scalar(32);
3511   LLT StoreVT = MRI.getType(Reg);
3512   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3513 
3514   auto Unmerge = B.buildUnmerge(S16, Reg);
3515 
3516   SmallVector<Register, 4> WideRegs;
3517   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3518     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3519 
3520   int NumElts = StoreVT.getNumElements();
3521 
3522   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3523 }
3524 
3525 Register AMDGPULegalizerInfo::fixStoreSourceType(
3526   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3527   MachineRegisterInfo *MRI = B.getMRI();
3528   LLT Ty = MRI->getType(VData);
3529 
3530   const LLT S16 = LLT::scalar(16);
3531 
3532   // Fixup illegal register types for i8 stores.
3533   if (Ty == LLT::scalar(8) || Ty == S16) {
3534     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3535     return AnyExt;
3536   }
3537 
3538   if (Ty.isVector()) {
3539     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3540       if (IsFormat)
3541         return handleD16VData(B, *MRI, VData);
3542     }
3543   }
3544 
3545   return VData;
3546 }
3547 
3548 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3549                                               MachineRegisterInfo &MRI,
3550                                               MachineIRBuilder &B,
3551                                               bool IsTyped,
3552                                               bool IsFormat) const {
3553   Register VData = MI.getOperand(1).getReg();
3554   LLT Ty = MRI.getType(VData);
3555   LLT EltTy = Ty.getScalarType();
3556   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3557   const LLT S32 = LLT::scalar(32);
3558 
3559   VData = fixStoreSourceType(B, VData, IsFormat);
3560   Register RSrc = MI.getOperand(2).getReg();
3561 
3562   MachineMemOperand *MMO = *MI.memoperands_begin();
3563   const int MemSize = MMO->getSize();
3564 
3565   unsigned ImmOffset;
3566   unsigned TotalOffset;
3567 
3568   // The typed intrinsics add an immediate after the registers.
3569   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3570 
3571   // The struct intrinsic variants add one additional operand over raw.
3572   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3573   Register VIndex;
3574   int OpOffset = 0;
3575   if (HasVIndex) {
3576     VIndex = MI.getOperand(3).getReg();
3577     OpOffset = 1;
3578   }
3579 
3580   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3581   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3582 
3583   unsigned Format = 0;
3584   if (IsTyped) {
3585     Format = MI.getOperand(5 + OpOffset).getImm();
3586     ++OpOffset;
3587   }
3588 
3589   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3590 
3591   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3592   if (TotalOffset != 0)
3593     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3594 
3595   unsigned Opc;
3596   if (IsTyped) {
3597     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3598                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3599   } else if (IsFormat) {
3600     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3601                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3602   } else {
3603     switch (MemSize) {
3604     case 1:
3605       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3606       break;
3607     case 2:
3608       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3609       break;
3610     default:
3611       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3612       break;
3613     }
3614   }
3615 
3616   if (!VIndex)
3617     VIndex = B.buildConstant(S32, 0).getReg(0);
3618 
3619   auto MIB = B.buildInstr(Opc)
3620     .addUse(VData)              // vdata
3621     .addUse(RSrc)               // rsrc
3622     .addUse(VIndex)             // vindex
3623     .addUse(VOffset)            // voffset
3624     .addUse(SOffset)            // soffset
3625     .addImm(ImmOffset);         // offset(imm)
3626 
3627   if (IsTyped)
3628     MIB.addImm(Format);
3629 
3630   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3631      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3632      .addMemOperand(MMO);
3633 
3634   MI.eraseFromParent();
3635   return true;
3636 }
3637 
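// Lower a raw/struct buffer load intrinsic. Sub-dword (i8/i16) results are
// loaded into a wider 32-bit register and truncated afterwards, and d16 data
// on unpacked-D16 subtargets is loaded as 32-bit elements and repacked into
// the original 16-bit vector type.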
3638 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3639                                              MachineRegisterInfo &MRI,
3640                                              MachineIRBuilder &B,
3641                                              bool IsFormat,
3642                                              bool IsTyped) const {
3643   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3644   MachineMemOperand *MMO = *MI.memoperands_begin();
3645   const int MemSize = MMO->getSize();
3646   const LLT S32 = LLT::scalar(32);
3647 
3648   Register Dst = MI.getOperand(0).getReg();
3649   Register RSrc = MI.getOperand(2).getReg();
3650 
3651   // The typed intrinsics add an immediate after the registers.
3652   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3653 
3654   // The struct intrinsic variants add one additional operand over raw.
3655   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3656   Register VIndex;
3657   int OpOffset = 0;
3658   if (HasVIndex) {
3659     VIndex = MI.getOperand(3).getReg();
3660     OpOffset = 1;
3661   }
3662 
3663   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3664   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3665 
3666   unsigned Format = 0;
3667   if (IsTyped) {
3668     Format = MI.getOperand(5 + OpOffset).getImm();
3669     ++OpOffset;
3670   }
3671 
3672   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3673   unsigned ImmOffset;
3674   unsigned TotalOffset;
3675 
3676   LLT Ty = MRI.getType(Dst);
3677   LLT EltTy = Ty.getScalarType();
3678   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3679   const bool Unpacked = ST.hasUnpackedD16VMem();
3680 
3681   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3682   if (TotalOffset != 0)
3683     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3684 
3685   unsigned Opc;
3686 
3687   if (IsTyped) {
3688     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3689                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3690   } else if (IsFormat) {
3691     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3692                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3693   } else {
3694     switch (MemSize) {
3695     case 1:
3696       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3697       break;
3698     case 2:
3699       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3700       break;
3701     default:
3702       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3703       break;
3704     }
3705   }
3706 
3707   Register LoadDstReg;
3708 
3709   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3710   LLT UnpackedTy = Ty.changeElementSize(32);
3711 
3712   if (IsExtLoad)
3713     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3714   else if (Unpacked && IsD16 && Ty.isVector())
3715     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3716   else
3717     LoadDstReg = Dst;
3718 
3719   if (!VIndex)
3720     VIndex = B.buildConstant(S32, 0).getReg(0);
3721 
3722   auto MIB = B.buildInstr(Opc)
3723     .addDef(LoadDstReg)         // vdata
3724     .addUse(RSrc)               // rsrc
3725     .addUse(VIndex)             // vindex
3726     .addUse(VOffset)            // voffset
3727     .addUse(SOffset)            // soffset
3728     .addImm(ImmOffset);         // offset(imm)
3729 
3730   if (IsTyped)
3731     MIB.addImm(Format);
3732 
3733   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3734      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3735      .addMemOperand(MMO);
3736 
3737   if (LoadDstReg != Dst) {
3738     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3739 
3740     // An extending load had its result widened; truncate back to the original type.
3741     if (IsExtLoad)
3742       B.buildTrunc(Dst, LoadDstReg);
3743     else {
3744       // Repack to original 16-bit vector result
3745       // FIXME: G_TRUNC should work, but legalization currently fails
3746       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3747       SmallVector<Register, 4> Repack;
3748       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3749         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3750       B.buildMerge(Dst, Repack);
3751     }
3752   }
3753 
3754   MI.eraseFromParent();
3755   return true;
3756 }
3757 
3758 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3759                                                MachineIRBuilder &B,
3760                                                bool IsInc) const {
3761   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3762                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3763   B.buildInstr(Opc)
3764     .addDef(MI.getOperand(0).getReg())
3765     .addUse(MI.getOperand(2).getReg())
3766     .addUse(MI.getOperand(3).getReg())
3767     .cloneMemRefs(MI);
3768   MI.eraseFromParent();
3769   return true;
3770 }
3771 
3772 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3773   switch (IntrID) {
3774   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3775   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3776     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3777   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3778   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3779     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3780   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3781   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3782     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3783   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3784   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3785     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3786   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3787   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3788     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3789   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3790   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3791     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3792   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3793   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3794     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3795   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3796   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3797     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3798   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3799   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3800     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3801   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3802   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3803     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3804   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3805   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3806     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3807   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3808   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3809     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3810   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3811   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3812     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3813   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3814   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3815     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3816   default:
3817     llvm_unreachable("unhandled atomic opcode");
3818   }
3819 }
3820 
3821 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3822                                                MachineIRBuilder &B,
3823                                                Intrinsic::ID IID) const {
3824   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3825                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3826   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3827 
3828   Register Dst;
3829 
3830   int OpOffset = 0;
3831   if (HasReturn) {
3832     // A few FP atomics do not support return values.
3833     Dst = MI.getOperand(0).getReg();
3834   } else {
3835     OpOffset = -1;
3836   }
3837 
3838   Register VData = MI.getOperand(2 + OpOffset).getReg();
3839   Register CmpVal;
3840 
3841   if (IsCmpSwap) {
3842     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3843     ++OpOffset;
3844   }
3845 
3846   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3847   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3848 
3849   // The struct intrinsic variants add one additional operand over raw.
3850   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3851   Register VIndex;
3852   if (HasVIndex) {
3853     VIndex = MI.getOperand(4 + OpOffset).getReg();
3854     ++OpOffset;
3855   }
3856 
3857   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3858   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3859   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3860 
3861   MachineMemOperand *MMO = *MI.memoperands_begin();
3862 
3863   unsigned ImmOffset;
3864   unsigned TotalOffset;
3865   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3866   if (TotalOffset != 0)
3867     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3868 
3869   if (!VIndex)
3870     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3871 
3872   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3873 
3874   if (HasReturn)
3875     MIB.addDef(Dst);
3876 
3877   MIB.addUse(VData); // vdata
3878 
3879   if (IsCmpSwap)
3880     MIB.addReg(CmpVal);
3881 
3882   MIB.addUse(RSrc)               // rsrc
3883      .addUse(VIndex)             // vindex
3884      .addUse(VOffset)            // voffset
3885      .addUse(SOffset)            // soffset
3886      .addImm(ImmOffset)          // offset(imm)
3887      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3888      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3889      .addMemOperand(MMO);
3890 
3891   MI.eraseFromParent();
3892   return true;
3893 }
3894 
3895 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3896 /// vector with s16 typed elements.
3897 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3898                                         SmallVectorImpl<Register> &PackedAddrs,
3899                                         int AddrIdx, int DimIdx, int EndIdx,
3900                                         int NumGradients) {
3901   const LLT S16 = LLT::scalar(16);
3902   const LLT V2S16 = LLT::vector(2, 16);
3903 
3904   for (int I = AddrIdx; I < EndIdx; ++I) {
3905     MachineOperand &SrcOp = MI.getOperand(I);
3906     if (!SrcOp.isReg())
3907       continue; // _L to _LZ may have eliminated this.
3908 
3909     Register AddrReg = SrcOp.getReg();
3910 
3911     if (I < DimIdx) {
3912       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3913       PackedAddrs.push_back(AddrReg);
3914     } else {
3915       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3916       // derivatives dx/dh and dx/dv are packed with undef.
3917       if (((I + 1) >= EndIdx) ||
3918           ((NumGradients / 2) % 2 == 1 &&
3919            (I == DimIdx + (NumGradients / 2) - 1 ||
3920             I == DimIdx + NumGradients - 1)) ||
3921           // Check for _L to _LZ optimization
3922           !MI.getOperand(I + 1).isReg()) {
3923         PackedAddrs.push_back(
3924             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3925                 .getReg(0));
3926       } else {
3927         PackedAddrs.push_back(
3928             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3929                 .getReg(0));
3930         ++I;
3931       }
3932     }
3933   }
3934 }
3935 
3936 /// Convert from separate vaddr components to a single vector address register,
3937 /// and replace the remaining operands with $noreg.
3938 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3939                                      int DimIdx, int NumVAddrs) {
3940   const LLT S32 = LLT::scalar(32);
3941 
3942   SmallVector<Register, 8> AddrRegs;
3943   for (int I = 0; I != NumVAddrs; ++I) {
3944     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3945     if (SrcOp.isReg()) {
3946       AddrRegs.push_back(SrcOp.getReg());
3947       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3948     }
3949   }
3950 
3951   int NumAddrRegs = AddrRegs.size();
3952   if (NumAddrRegs != 1) {
3953     // Round up to 8 elements for v5-v7
3954     // FIXME: Missing intermediate sized register classes and instructions.
3955     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3956       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3957       auto Undef = B.buildUndef(S32);
3958       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3959       NumAddrRegs = RoundedNumRegs;
3960     }
3961 
3962     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3963     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3964   }
3965 
3966   for (int I = 1; I != NumVAddrs; ++I) {
3967     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3968     if (SrcOp.isReg())
3969       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3970   }
3971 }
3972 
3973 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3974 ///
3975 /// Depending on the subtarget, load/store with 16-bit element data need to be
3976 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3977 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3978 /// registers.
3979 ///
3980 /// We don't want to directly select image instructions just yet, but also want
3981 /// to expose all register repacking to the legalizer/combiners. We also don't
3982 /// want a selected instruction entering RegBankSelect. In order to avoid
3983 /// defining a multitude of intermediate image instructions, directly hack on
3984 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3985 /// now unnecessary arguments with $noreg.
3986 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3987     MachineInstr &MI, MachineIRBuilder &B,
3988     GISelChangeObserver &Observer,
3989     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3990 
3991   const int NumDefs = MI.getNumExplicitDefs();
3992   bool IsTFE = NumDefs == 2;
3993   // We are only processing the operands of d16 image operations on subtargets
3994   // that use the unpacked register layout, or need to repack the TFE result.
3995 
3996   // TODO: Do we need to guard against already legalized intrinsics?
3997   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3998     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3999 
4000   MachineRegisterInfo *MRI = B.getMRI();
4001   const LLT S32 = LLT::scalar(32);
4002   const LLT S16 = LLT::scalar(16);
4003   const LLT V2S16 = LLT::vector(2, 16);
4004 
4005   // Index of first address argument
4006   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
4007 
4008   int NumVAddrs, NumGradients;
4009   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
4010   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
4011     getDMaskIdx(BaseOpcode, NumDefs);
4012   unsigned DMask = 0;
4013 
4014   // Check for 16 bit addresses and pack if true.
4015   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4016   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
4017   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
4018   const bool IsG16 = GradTy == S16;
4019   const bool IsA16 = AddrTy == S16;
4020 
4021   int DMaskLanes = 0;
4022   if (!BaseOpcode->Atomic) {
4023     DMask = MI.getOperand(DMaskIdx).getImm();
4024     if (BaseOpcode->Gather4) {
4025       DMaskLanes = 4;
4026     } else if (DMask != 0) {
4027       DMaskLanes = countPopulation(DMask);
4028     } else if (!IsTFE && !BaseOpcode->Store) {
4029       // If dmask is 0, this is a no-op load. This can be eliminated.
4030       B.buildUndef(MI.getOperand(0));
4031       MI.eraseFromParent();
4032       return true;
4033     }
4034   }
4035 
4036   Observer.changingInstr(MI);
4037   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4038 
4039   unsigned NewOpcode = NumDefs == 0 ?
4040     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4041 
4042   // Track that we legalized this
4043   MI.setDesc(B.getTII().get(NewOpcode));
4044 
4045   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4046   // dmask to be at least 1, otherwise the instruction will fail.
4047   if (IsTFE && DMask == 0) {
4048     DMask = 0x1;
4049     DMaskLanes = 1;
4050     MI.getOperand(DMaskIdx).setImm(DMask);
4051   }
4052 
4053   if (BaseOpcode->Atomic) {
4054     Register VData0 = MI.getOperand(2).getReg();
4055     LLT Ty = MRI->getType(VData0);
4056 
4057     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4058     if (Ty.isVector())
4059       return false;
4060 
4061     if (BaseOpcode->AtomicX2) {
4062       Register VData1 = MI.getOperand(3).getReg();
4063       // The two values are packed in one register.
4064       LLT PackedTy = LLT::vector(2, Ty);
4065       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4066       MI.getOperand(2).setReg(Concat.getReg(0));
4067       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4068     }
4069   }
4070 
4071   int CorrectedNumVAddrs = NumVAddrs;
4072 
4073   // Optimize _L to _LZ when _L is zero
4074   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4075         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
4076     const ConstantFP *ConstantLod;
4077     const int LodIdx = AddrIdx + NumVAddrs - 1;
4078 
4079     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
4080       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4081         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4082         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
4083           LZMappingInfo->LZ, ImageDimIntr->Dim);
4084 
4085         // The starting indexes should remain in the same place.
4086         --NumVAddrs;
4087         --CorrectedNumVAddrs;
4088 
4089         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
4090           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
4091         MI.RemoveOperand(LodIdx);
4092       }
4093     }
4094   }
4095 
4096   // Optimize _mip away when 'lod' is zero.
4097   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
4098     int64_t ConstantLod;
4099     const int LodIdx = AddrIdx + NumVAddrs - 1;
4100 
4101     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
4102       if (ConstantLod == 0) {
4103         // TODO: Change the intrinsic opcode and remove the operand instead of
4104         // replacing it with 0, as the _L to _LZ handling above does.
4105         MI.getOperand(LodIdx).ChangeToImmediate(0);
4106         --CorrectedNumVAddrs;
4107       }
4108     }
4109   }
4110 
4111   // Rewrite the addressing register layout before doing anything else.
4112   if (IsA16 || IsG16) {
4113     if (IsA16) {
4114       // Target must support the feature and gradients need to be 16 bit too
4115       if (!ST.hasA16() || !IsG16)
4116         return false;
4117     } else if (!ST.hasG16())
4118       return false;
4119 
4120     if (NumVAddrs > 1) {
4121       SmallVector<Register, 4> PackedRegs;
4122       // Don't compress addresses for G16
4123       const int PackEndIdx =
4124           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4125       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4126                                   PackEndIdx, NumGradients);
4127 
4128       if (!IsA16) {
4129         // Add uncompressed address
4130         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4131           Register AddrReg = MI.getOperand(I).getReg();
4132           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4133           PackedRegs.push_back(AddrReg);
4134         }
4135       }
4136 
4137       // See also below in the non-a16 branch
4138       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4139 
4140       if (!UseNSA && PackedRegs.size() > 1) {
4141         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4142         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4143         PackedRegs[0] = Concat.getReg(0);
4144         PackedRegs.resize(1);
4145       }
4146 
4147       const int NumPacked = PackedRegs.size();
4148       for (int I = 0; I != NumVAddrs; ++I) {
4149         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4150         if (!SrcOp.isReg()) {
4151           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4152           continue;
4153         }
4154 
4155         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4156 
4157         if (I < NumPacked)
4158           SrcOp.setReg(PackedRegs[I]);
4159         else
4160           SrcOp.setReg(AMDGPU::NoRegister);
4161       }
4162     }
4163   } else {
4164     // If the register allocator cannot place the address registers contiguously
4165     // without introducing moves, then using the non-sequential address encoding
4166     // is always preferable, since it saves VALU instructions and is usually a
4167     // wash in terms of code size or even better.
4168     //
4169     // However, we currently have no way of hinting to the register allocator
4170     // that MIMG addresses should be placed contiguously when it is possible to
4171     // do so, so force non-NSA for the common 2-address case as a heuristic.
4172     //
4173     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4174     // allocation when possible.
4175     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4176 
4177     if (!UseNSA && NumVAddrs > 1)
4178       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4179   }
4180 
4181   int Flags = 0;
4182   if (IsA16)
4183     Flags |= 1;
4184   if (IsG16)
4185     Flags |= 2;
4186   MI.addOperand(MachineOperand::CreateImm(Flags));
4187 
4188   if (BaseOpcode->Store) { // No TFE for stores?
4189     // TODO: Handle dmask trim
4190     Register VData = MI.getOperand(1).getReg();
4191     LLT Ty = MRI->getType(VData);
4192     if (!Ty.isVector() || Ty.getElementType() != S16)
4193       return true;
4194 
4195     Register RepackedReg = handleD16VData(B, *MRI, VData);
4196     if (RepackedReg != VData) {
4197       MI.getOperand(1).setReg(RepackedReg);
4198     }
4199 
4200     return true;
4201   }
4202 
4203   Register DstReg = MI.getOperand(0).getReg();
4204   LLT Ty = MRI->getType(DstReg);
4205   const LLT EltTy = Ty.getScalarType();
4206   const bool IsD16 = Ty.getScalarType() == S16;
4207   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4208 
4209   // Confirm that the return type is large enough for the dmask specified
4210   if (NumElts < DMaskLanes)
4211     return false;
4212 
4213   if (NumElts > 4 || DMaskLanes > 4)
4214     return false;
4215 
4216   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4217   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4218 
4219   // The raw dword aligned data component of the load. The only legal cases
4220   // where this matters should be when using the packed D16 format, for
4221   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4222   LLT RoundedTy;
4223 
4224   // S32 vector to cover all data, plus the TFE result element.
4225   LLT TFETy;
4226 
4227   // Register type to use for each loaded component. Will be S32 or V2S16.
4228   LLT RegTy;
4229 
4230   if (IsD16 && ST.hasUnpackedD16VMem()) {
4231     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4232     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4233     RegTy = S32;
4234   } else {
4235     unsigned EltSize = EltTy.getSizeInBits();
4236     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4237     unsigned RoundedSize = 32 * RoundedElts;
4238     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4239     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4240     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4241   }
4242 
4243   // The return type does not need adjustment.
4244   // TODO: Should we change s16 case to s32 or <2 x s16>?
4245   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4246     return true;
4247 
4248   Register Dst1Reg;
4249 
4250   // Insert after the instruction.
4251   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4252 
4253   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4254   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4255   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4256   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4257 
4258   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4259 
4260   MI.getOperand(0).setReg(NewResultReg);
4261 
4262   // In the IR, TFE is supposed to be used with a 2 element struct return
4263   // type. The instruction really returns these two values in one contiguous
4264   // register, with one additional dword beyond the loaded data. Rewrite the
4265   // return type to use a single register result.
4266 
4267   if (IsTFE) {
4268     Dst1Reg = MI.getOperand(1).getReg();
4269     if (MRI->getType(Dst1Reg) != S32)
4270       return false;
4271 
4272     // TODO: Make sure the TFE operand bit is set.
4273     MI.RemoveOperand(1);
4274 
4275     // Handle the easy case that requires no repack instructions.
4276     if (Ty == S32) {
4277       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4278       return true;
4279     }
4280   }
4281 
4282   // Now figure out how to copy the new result register back into the old
4283   // result.
4284   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4285 
4286   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4287 
4288   if (ResultNumRegs == 1) {
4289     assert(!IsTFE);
4290     ResultRegs[0] = NewResultReg;
4291   } else {
4292     // We have to repack into a new vector of some kind.
4293     for (int I = 0; I != NumDataRegs; ++I)
4294       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4295     B.buildUnmerge(ResultRegs, NewResultReg);
4296 
4297     // Drop the final TFE element to get the data part. The TFE result is
4298     // directly written to the right place already.
4299     if (IsTFE)
4300       ResultRegs.resize(NumDataRegs);
4301   }
4302 
4303   // For an s16 scalar result, we form an s32 result with a truncate regardless
4304   // of packed vs. unpacked.
4305   if (IsD16 && !Ty.isVector()) {
4306     B.buildTrunc(DstReg, ResultRegs[0]);
4307     return true;
4308   }
4309 
4310   // Avoid a build/concat_vector of 1 entry.
4311   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4312     B.buildBitcast(DstReg, ResultRegs[0]);
4313     return true;
4314   }
4315 
4316   assert(Ty.isVector());
4317 
4318   if (IsD16) {
4319     // For packed D16 results with TFE enabled, all the data components are
4320     // S32. Cast back to the expected type.
4321     //
    // TODO: We don't really need to load s32 elements; we would only need one
    // cast for the TFE result if a multiple of v2s16 were used.
4324     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4325       for (Register &Reg : ResultRegs)
4326         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4327     } else if (ST.hasUnpackedD16VMem()) {
4328       for (Register &Reg : ResultRegs)
4329         Reg = B.buildTrunc(S16, Reg).getReg(0);
4330     }
4331   }
4332 
  auto padWithUndef = [&](LLT PadTy, int NumPadElts) {
    if (NumPadElts == 0)
      return;
    Register Undef = B.buildUndef(PadTy).getReg(0);
    for (int I = 0; I != NumPadElts; ++I)
      ResultRegs.push_back(Undef);
  };
4340 
4341   // Pad out any elements eliminated due to the dmask.
4342   LLT ResTy = MRI->getType(ResultRegs[0]);
4343   if (!ResTy.isVector()) {
4344     padWithUndef(ResTy, NumElts - ResultRegs.size());
4345     B.buildBuildVector(DstReg, ResultRegs);
4346     return true;
4347   }
4348 
4349   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
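  // Each result register holds a <2 x s16>, so RegsToCover is the number of
  // registers needed to cover the full result type.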
4350   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4351 
  // Deal with the one annoying legal case: a <3 x s16> result. Pad to a
  // <6 x s16> concat and unmerge to extract the low <3 x s16>.
4353   const LLT V3S16 = LLT::vector(3, 16);
4354   if (Ty == V3S16) {
4355     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4356     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4357     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4358     return true;
4359   }
4360 
4361   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4362   B.buildConcatVectors(DstReg, ResultRegs);
4363   return true;
4364 }
4365 
4366 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4367   LegalizerHelper &Helper, MachineInstr &MI) const {
4368   MachineIRBuilder &B = Helper.MIRBuilder;
4369   GISelChangeObserver &Observer = Helper.Observer;
4370 
4371   Register Dst = MI.getOperand(0).getReg();
4372   LLT Ty = B.getMRI()->getType(Dst);
4373   unsigned Size = Ty.getSizeInBits();
4374   MachineFunction &MF = B.getMF();
4375 
4376   Observer.changingInstr(MI);
4377 
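  // Cast away result types that are awkward for the selection patterns (e.g.
  // vectors of small elements) in favor of an equivalent 32-bit based type.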
4378   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4379     Ty = getBitcastRegisterType(Ty);
4380     Helper.bitcastDst(MI, Ty, 0);
4381     Dst = MI.getOperand(0).getReg();
4382     B.setInsertPt(B.getMBB(), MI);
4383   }
4384 
4385   // FIXME: We don't really need this intermediate instruction. The intrinsic
4386   // should be fixed to have a memory operand. Since it's readnone, we're not
4387   // allowed to add one.
4388   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4389   MI.RemoveOperand(1); // Remove intrinsic ID
4390 
4391   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4392   // TODO: Should this use datalayout alignment?
4393   const unsigned MemSize = (Size + 7) / 8;
4394   const Align MemAlign(4);
4395   MachineMemOperand *MMO = MF.getMachineMemOperand(
4396       MachinePointerInfo(),
4397       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4398           MachineMemOperand::MOInvariant,
4399       MemSize, MemAlign);
4400   MI.addMemOperand(MF, MMO);
4401 
4402   // There are no 96-bit result scalar loads, but widening to 128-bit should
4403   // always be legal. We may need to restore this to a 96-bit result if it turns
4404   // out this needs to be converted to a vector load during RegBankSelect.
4405   if (!isPowerOf2_32(Size)) {
4406     if (Ty.isVector())
4407       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4408     else
4409       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4410   }
4411 
4412   Observer.changedInstr(MI);
4413   return true;
4414 }
4415 
4416 // TODO: Move to selection
4417 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4418                                                 MachineRegisterInfo &MRI,
4419                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4421   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4422       !ST.isTrapHandlerEnabled()) {
4423     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4424   } else {
    // Pass the queue pointer to the trap handler as input, and insert the
    // trap instruction.
4426     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4427     MachineRegisterInfo &MRI = *B.getMRI();
4428 
4429     Register LiveIn =
4430       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4431     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4432       return false;
4433 
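    // The trap handler ABI expects the queue pointer in SGPR0_SGPR1.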
4434     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4435     B.buildCopy(SGPR01, LiveIn);
4436     B.buildInstr(AMDGPU::S_TRAP)
4437         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4438         .addReg(SGPR01, RegState::Implicit);
4439   }
4440 
4441   MI.eraseFromParent();
4442   return true;
4443 }
4444 
4445 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4446     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4449   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4450       !ST.isTrapHandlerEnabled()) {
4451     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4452                                      "debugtrap handler not supported",
4453                                      MI.getDebugLoc(), DS_Warning);
4454     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4455     Ctx.diagnose(NoTrap);
4456   } else {
4457     // Insert debug-trap instruction
4458     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4459   }
4460 
4461   MI.eraseFromParent();
4462   return true;
4463 }
4464 
4465 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4466                                             MachineInstr &MI) const {
4467   MachineIRBuilder &B = Helper.MIRBuilder;
4468   MachineRegisterInfo &MRI = *B.getMRI();
4469 
  // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
4471   auto IntrID = MI.getIntrinsicID();
4472   switch (IntrID) {
4473   case Intrinsic::amdgcn_if:
4474   case Intrinsic::amdgcn_else: {
4475     MachineInstr *Br = nullptr;
4476     MachineBasicBlock *UncondBrTarget = nullptr;
4477     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4478       const SIRegisterInfo *TRI
4479         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4480 
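      // Operand 1 is the wave-mask result of the intrinsic; operand 3 is the
      // condition input (amdgcn_if) or the saved exec mask (amdgcn_else).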
4481       Register Def = MI.getOperand(1).getReg();
4482       Register Use = MI.getOperand(3).getReg();
4483 
4484       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4485       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4486       if (IntrID == Intrinsic::amdgcn_if) {
4487         B.buildInstr(AMDGPU::SI_IF)
4488           .addDef(Def)
4489           .addUse(Use)
4490           .addMBB(UncondBrTarget);
4491       } else {
4492         B.buildInstr(AMDGPU::SI_ELSE)
4493           .addDef(Def)
4494           .addUse(Use)
4495           .addMBB(UncondBrTarget)
4496           .addImm(0);
4497       }
4498 
4499       if (Br) {
4500         Br->getOperand(0).setMBB(CondBrTarget);
4501       } else {
4502         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4503         // since we're swapping branch targets it needs to be reinserted.
4504         // FIXME: IRTranslator should probably not do this
4505         B.buildBr(*CondBrTarget);
4506       }
4507 
4508       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4509       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4510       MI.eraseFromParent();
4511       BrCond->eraseFromParent();
4512       return true;
4513     }
4514 
4515     return false;
4516   }
4517   case Intrinsic::amdgcn_loop: {
4518     MachineInstr *Br = nullptr;
4519     MachineBasicBlock *UncondBrTarget = nullptr;
4520     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4521       const SIRegisterInfo *TRI
4522         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4523 
4524       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
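      // Operand 2 is the wave mask that SI_LOOP uses to decide whether
      // another iteration is required.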
4525       Register Reg = MI.getOperand(2).getReg();
4526 
4527       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4528       B.buildInstr(AMDGPU::SI_LOOP)
4529         .addUse(Reg)
4530         .addMBB(UncondBrTarget);
4531 
4532       if (Br)
4533         Br->getOperand(0).setMBB(CondBrTarget);
4534       else
4535         B.buildBr(*CondBrTarget);
4536 
4537       MI.eraseFromParent();
4538       BrCond->eraseFromParent();
4539       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4540       return true;
4541     }
4542 
4543     return false;
4544   }
4545   case Intrinsic::amdgcn_kernarg_segment_ptr:
4546     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4547       // This only makes sense to call in a kernel, so just lower to null.
4548       B.buildConstant(MI.getOperand(0).getReg(), 0);
4549       MI.eraseFromParent();
4550       return true;
4551     }
4552 
4553     return legalizePreloadedArgIntrin(
4554       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4555   case Intrinsic::amdgcn_implicitarg_ptr:
4556     return legalizeImplicitArgPtr(MI, MRI, B);
4557   case Intrinsic::amdgcn_workitem_id_x:
4558     return legalizePreloadedArgIntrin(MI, MRI, B,
4559                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4560   case Intrinsic::amdgcn_workitem_id_y:
4561     return legalizePreloadedArgIntrin(MI, MRI, B,
4562                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4563   case Intrinsic::amdgcn_workitem_id_z:
4564     return legalizePreloadedArgIntrin(MI, MRI, B,
4565                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4566   case Intrinsic::amdgcn_workgroup_id_x:
4567     return legalizePreloadedArgIntrin(MI, MRI, B,
4568                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4569   case Intrinsic::amdgcn_workgroup_id_y:
4570     return legalizePreloadedArgIntrin(MI, MRI, B,
4571                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4572   case Intrinsic::amdgcn_workgroup_id_z:
4573     return legalizePreloadedArgIntrin(MI, MRI, B,
4574                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4575   case Intrinsic::amdgcn_dispatch_ptr:
4576     return legalizePreloadedArgIntrin(MI, MRI, B,
4577                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4578   case Intrinsic::amdgcn_queue_ptr:
4579     return legalizePreloadedArgIntrin(MI, MRI, B,
4580                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4581   case Intrinsic::amdgcn_implicit_buffer_ptr:
4582     return legalizePreloadedArgIntrin(
4583       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4584   case Intrinsic::amdgcn_dispatch_id:
4585     return legalizePreloadedArgIntrin(MI, MRI, B,
4586                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4587   case Intrinsic::amdgcn_fdiv_fast:
4588     return legalizeFDIVFastIntrin(MI, MRI, B);
4589   case Intrinsic::amdgcn_is_shared:
4590     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4591   case Intrinsic::amdgcn_is_private:
4592     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4593   case Intrinsic::amdgcn_wavefrontsize: {
4594     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4595     MI.eraseFromParent();
4596     return true;
4597   }
4598   case Intrinsic::amdgcn_s_buffer_load:
4599     return legalizeSBufferLoad(Helper, MI);
4600   case Intrinsic::amdgcn_raw_buffer_store:
4601   case Intrinsic::amdgcn_struct_buffer_store:
4602     return legalizeBufferStore(MI, MRI, B, false, false);
4603   case Intrinsic::amdgcn_raw_buffer_store_format:
4604   case Intrinsic::amdgcn_struct_buffer_store_format:
4605     return legalizeBufferStore(MI, MRI, B, false, true);
4606   case Intrinsic::amdgcn_raw_tbuffer_store:
4607   case Intrinsic::amdgcn_struct_tbuffer_store:
4608     return legalizeBufferStore(MI, MRI, B, true, true);
4609   case Intrinsic::amdgcn_raw_buffer_load:
4610   case Intrinsic::amdgcn_struct_buffer_load:
4611     return legalizeBufferLoad(MI, MRI, B, false, false);
4612   case Intrinsic::amdgcn_raw_buffer_load_format:
4613   case Intrinsic::amdgcn_struct_buffer_load_format:
4614     return legalizeBufferLoad(MI, MRI, B, true, false);
4615   case Intrinsic::amdgcn_raw_tbuffer_load:
4616   case Intrinsic::amdgcn_struct_tbuffer_load:
4617     return legalizeBufferLoad(MI, MRI, B, true, true);
4618   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4619   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4620   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4621   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4622   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4623   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4624   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4625   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4626   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4627   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4628   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4629   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4630   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4631   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4632   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4633   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4634   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4635   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4636   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4637   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4638   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4639   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4640   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4641   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4642   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4643   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4644   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4645   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4646     return legalizeBufferAtomic(MI, B, IntrID);
4647   case Intrinsic::amdgcn_atomic_inc:
4648     return legalizeAtomicIncDec(MI, B, true);
4649   case Intrinsic::amdgcn_atomic_dec:
4650     return legalizeAtomicIncDec(MI, B, false);
4651   case Intrinsic::trap:
4652     return legalizeTrapIntrinsic(MI, MRI, B);
4653   case Intrinsic::debugtrap:
4654     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4655   case Intrinsic::amdgcn_rsq_clamp:
4656     return legalizeRsqClampIntrinsic(MI, MRI, B);
4657   case Intrinsic::amdgcn_ds_fadd:
4658   case Intrinsic::amdgcn_ds_fmin:
4659   case Intrinsic::amdgcn_ds_fmax:
4660     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4661   default: {
4662     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4663             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4664       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4665     return true;
4666   }
4667   }
4668 
4669   return true;
4670 }
4671