1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of vector elements up to the next power of two.
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the scalar size in bits up to the next power of two.
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
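// Matches types whose total size in bits is a multiple of 32.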
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
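// Matches vectors of 16-bit elements with more than two elements.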
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
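// Mutation that pads the vector type with one additional element.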
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
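// Mutation that reduces the element count so each resulting piece is roughly
// 64 bits.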
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
// Increase the number of vector elements so the total size is a multiple of
// 32 bits.
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
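// Pick an equivalently sized type to bitcast to: a plain scalar for sizes up
// to 32 bits, otherwise 32-bit elements.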
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
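// Mutation that bitcasts to 32-bit elements; the total size must already be a
// multiple of 32.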
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
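// Check if this size fits in a whole number of 32-bit registers, up to the
// maximum register size.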
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
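// Check for element types that pack cleanly into 32-bit registers (16-bit, or
// a multiple of 32 bits).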
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
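// Check if a vector of this element type can live directly in registers;
// 16-bit elements are only allowed in even-length vectors.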
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
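// Matches vectors whose element type is individually legal (s16 or at least
// 32 bits wide).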
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
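// Matches truncating stores of scalars wider than 32 bits.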
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
248     // Treat constant and global as identical. SMRD loads are sometimes usable for
249     // global loads (ideally constant address space should be eliminated)
250     // depending on the context. Legality cannot be context dependent, but
251     // RegBankSelect can split the load as necessary depending on the pointer
252     // register bank/uniformity and if the memory is invariant or not written in a
253     // kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
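// Check whether the register size, memory size, alignment and address space
// of this load/store are directly supported.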
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
341 
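// Combined legality check: the result must be a register type, the size and
// alignment must be supported, and the bitcast workaround must not apply.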
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348 
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
356 
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
362 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
366 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
367                             unsigned AlignInBits, unsigned AddrSpace,
368                             unsigned Opcode) {
369   // We don't want to widen cases that are naturally legal.
370   if (isPowerOf2_32(SizeInBits))
371     return false;
372 
373   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
374   // end up widening these for a scalar load during RegBankSelect, since there
375   // aren't 96-bit scalar loads.
376   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
377     return false;
378 
379   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
380     return false;
381 
382   // A load is known dereferenceable up to the alignment, so it's legal to widen
383   // to it.
384   //
385   // TODO: Could check dereferenceable for less aligned cases.
386   unsigned RoundedSize = NextPowerOf2(SizeInBits);
387   if (AlignInBits < RoundedSize)
388     return false;
389 
390   // Do not widen if it would introduce a slow unaligned load.
391   const SITargetLowering *TLI = ST.getTargetLowering();
392   bool Fast = false;
393   return TLI->allowsMisalignedMemoryAccessesImpl(
394              RoundedSize, AddrSpace, Align(AlignInBits / 8),
395              MachineMemOperand::MOLoad, &Fast) &&
396          Fast;
397 }
398 
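// LegalityQuery wrapper for shouldWidenLoad; atomics are never widened.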
399 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
400                             unsigned Opcode) {
401   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
402     return false;
403 
404   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
405                          Query.MMODescrs[0].AlignInBits,
406                          Query.Types[1].getAddressSpace(), Opcode);
407 }
408 
409 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
410                                          const GCNTargetMachine &TM)
  : ST(ST_) {
412   using namespace TargetOpcode;
413 
414   auto GetAddrSpacePtr = [&TM](unsigned AS) {
415     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
416   };
417 
418   const LLT S1 = LLT::scalar(1);
419   const LLT S16 = LLT::scalar(16);
420   const LLT S32 = LLT::scalar(32);
421   const LLT S64 = LLT::scalar(64);
422   const LLT S128 = LLT::scalar(128);
423   const LLT S256 = LLT::scalar(256);
424   const LLT S512 = LLT::scalar(512);
425   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
426 
427   const LLT V2S16 = LLT::vector(2, 16);
428   const LLT V4S16 = LLT::vector(4, 16);
429 
430   const LLT V2S32 = LLT::vector(2, 32);
431   const LLT V3S32 = LLT::vector(3, 32);
432   const LLT V4S32 = LLT::vector(4, 32);
433   const LLT V5S32 = LLT::vector(5, 32);
434   const LLT V6S32 = LLT::vector(6, 32);
435   const LLT V7S32 = LLT::vector(7, 32);
436   const LLT V8S32 = LLT::vector(8, 32);
437   const LLT V9S32 = LLT::vector(9, 32);
438   const LLT V10S32 = LLT::vector(10, 32);
439   const LLT V11S32 = LLT::vector(11, 32);
440   const LLT V12S32 = LLT::vector(12, 32);
441   const LLT V13S32 = LLT::vector(13, 32);
442   const LLT V14S32 = LLT::vector(14, 32);
443   const LLT V15S32 = LLT::vector(15, 32);
444   const LLT V16S32 = LLT::vector(16, 32);
445   const LLT V32S32 = LLT::vector(32, 32);
446 
447   const LLT V2S64 = LLT::vector(2, 64);
448   const LLT V3S64 = LLT::vector(3, 64);
449   const LLT V4S64 = LLT::vector(4, 64);
450   const LLT V5S64 = LLT::vector(5, 64);
451   const LLT V6S64 = LLT::vector(6, 64);
452   const LLT V7S64 = LLT::vector(7, 64);
453   const LLT V8S64 = LLT::vector(8, 64);
454   const LLT V16S64 = LLT::vector(16, 64);
455 
456   std::initializer_list<LLT> AllS32Vectors =
457     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
458      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
459   std::initializer_list<LLT> AllS64Vectors =
460     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
461 
462   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
463   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
464   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
465   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
466   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
467   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
468   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
469 
470   const LLT CodePtr = FlatPtr;
471 
472   const std::initializer_list<LLT> AddrSpaces64 = {
473     GlobalPtr, ConstantPtr, FlatPtr
474   };
475 
476   const std::initializer_list<LLT> AddrSpaces32 = {
477     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
478   };
479 
480   const std::initializer_list<LLT> FPTypesBase = {
481     S32, S64
482   };
483 
484   const std::initializer_list<LLT> FPTypes16 = {
485     S32, S64, S16
486   };
487 
488   const std::initializer_list<LLT> FPTypesPK16 = {
489     S32, S64, S16, V2S16
490   };
491 
492   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
493 
494   setAction({G_BRCOND, S1}, Legal); // VCC branches
495   setAction({G_BRCOND, S32}, Legal); // SCC branches
496 
497   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
498   // elements for v3s16
499   getActionDefinitionsBuilder(G_PHI)
500     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
501     .legalFor(AllS32Vectors)
502     .legalFor(AllS64Vectors)
503     .legalFor(AddrSpaces64)
504     .legalFor(AddrSpaces32)
505     .legalIf(isPointer(0))
506     .clampScalar(0, S16, S256)
507     .widenScalarToNextPow2(0, 32)
508     .clampMaxNumElements(0, S32, 16)
509     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
510     .scalarize(0);
511 
512   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
513     // Full set of gfx9 features.
514     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
515       .legalFor({S32, S16, V2S16})
516       .clampScalar(0, S16, S32)
517       .clampMaxNumElements(0, S16, 2)
518       .scalarize(0)
519       .widenScalarToNextPow2(0, 32);
520 
521     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
522       .legalFor({S32, S16, V2S16}) // Clamp modifier
523       .minScalarOrElt(0, S16)
524       .clampMaxNumElements(0, S16, 2)
525       .scalarize(0)
526       .widenScalarToNextPow2(0, 32)
527       .lower();
528   } else if (ST.has16BitInsts()) {
529     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
530       .legalFor({S32, S16})
531       .clampScalar(0, S16, S32)
532       .scalarize(0)
533       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
534 
535     // Technically the saturating operations require clamp bit support, but this
536     // was introduced at the same time as 16-bit operations.
537     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
538       .legalFor({S32, S16}) // Clamp modifier
539       .minScalar(0, S16)
540       .scalarize(0)
541       .widenScalarToNextPow2(0, 16)
542       .lower();
543 
544     // We're just lowering this, but it helps get a better result to try to
545     // coerce to the desired type first.
546     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
547       .minScalar(0, S16)
548       .scalarize(0)
549       .lower();
550   } else {
551     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
552       .legalFor({S32})
553       .clampScalar(0, S32, S32)
554       .scalarize(0);
555 
556     if (ST.hasIntClamp()) {
557       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
558         .legalFor({S32}) // Clamp modifier.
559         .scalarize(0)
560         .minScalarOrElt(0, S32)
561         .lower();
562     } else {
563       // Clamp bit support was added in VI, along with 16-bit operations.
564       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
565         .minScalar(0, S32)
566         .scalarize(0)
567         .lower();
568     }
569 
570     // FIXME: DAG expansion gets better results. The widening uses the smaller
571     // range values and goes for the min/max lowering directly.
572     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
573       .minScalar(0, S32)
574       .scalarize(0)
575       .lower();
576   }
577 
578   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
579     .customFor({S32, S64})
580     .clampScalar(0, S32, S64)
581     .widenScalarToNextPow2(0, 32)
582     .scalarize(0);
583 
584   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
585     .legalFor({S32})
586     .clampScalar(0, S32, S32)
587     .scalarize(0);
588 
589   // Report legal for any types we can handle anywhere. For the cases only legal
590   // on the SALU, RegBankSelect will be able to re-legalize.
591   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
592     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
593     .clampScalar(0, S32, S64)
594     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
595     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
596     .widenScalarToNextPow2(0)
597     .scalarize(0);
598 
599   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
600                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
601     .legalFor({{S32, S1}, {S32, S32}})
602     .minScalar(0, S32)
603     // TODO: .scalarize(0)
604     .lower();
605 
606   getActionDefinitionsBuilder(G_BITCAST)
607     // Don't worry about the size constraint.
608     .legalIf(all(isRegisterType(0), isRegisterType(1)))
609     .lower();
610 
611 
612   getActionDefinitionsBuilder(G_CONSTANT)
613     .legalFor({S1, S32, S64, S16, GlobalPtr,
614                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
615     .legalIf(isPointer(0))
616     .clampScalar(0, S32, S64)
617     .widenScalarToNextPow2(0);
618 
619   getActionDefinitionsBuilder(G_FCONSTANT)
620     .legalFor({S32, S64, S16})
621     .clampScalar(0, S16, S64);
622 
623   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
624       .legalIf(isRegisterType(0))
625       // s1 and s16 are special cases because they have legal operations on
626       // them, but don't really occupy registers in the normal way.
627       .legalFor({S1, S16})
628       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
629       .clampScalarOrElt(0, S32, MaxScalar)
630       .widenScalarToNextPow2(0, 32)
631       .clampMaxNumElements(0, S32, 16);
632 
633   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
634 
635   // If the amount is divergent, we have to do a wave reduction to get the
636   // maximum value, so this is expanded during RegBankSelect.
637   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
638     .legalFor({{PrivatePtr, S32}});
639 
640   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
641     .customIf(typeIsNot(0, PrivatePtr));
642 
643   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
644 
645   auto &FPOpActions = getActionDefinitionsBuilder(
646     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
647     .legalFor({S32, S64});
648   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
649     .customFor({S32, S64});
650   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
651     .customFor({S32, S64});
652 
653   if (ST.has16BitInsts()) {
654     if (ST.hasVOP3PInsts())
655       FPOpActions.legalFor({S16, V2S16});
656     else
657       FPOpActions.legalFor({S16});
658 
659     TrigActions.customFor({S16});
660     FDIVActions.customFor({S16});
661   }
662 
663   auto &MinNumMaxNum = getActionDefinitionsBuilder({
664       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
665 
666   if (ST.hasVOP3PInsts()) {
667     MinNumMaxNum.customFor(FPTypesPK16)
668       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
669       .clampMaxNumElements(0, S16, 2)
670       .clampScalar(0, S16, S64)
671       .scalarize(0);
672   } else if (ST.has16BitInsts()) {
673     MinNumMaxNum.customFor(FPTypes16)
674       .clampScalar(0, S16, S64)
675       .scalarize(0);
676   } else {
677     MinNumMaxNum.customFor(FPTypesBase)
678       .clampScalar(0, S32, S64)
679       .scalarize(0);
680   }
681 
682   if (ST.hasVOP3PInsts())
683     FPOpActions.clampMaxNumElements(0, S16, 2);
684 
685   FPOpActions
686     .scalarize(0)
687     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
688 
689   TrigActions
690     .scalarize(0)
691     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
692 
693   FDIVActions
694     .scalarize(0)
695     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
696 
697   getActionDefinitionsBuilder({G_FNEG, G_FABS})
698     .legalFor(FPTypesPK16)
699     .clampMaxNumElements(0, S16, 2)
700     .scalarize(0)
701     .clampScalar(0, S16, S64);
702 
703   if (ST.has16BitInsts()) {
704     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
705       .legalFor({S32, S64, S16})
706       .scalarize(0)
707       .clampScalar(0, S16, S64);
708   } else {
709     getActionDefinitionsBuilder(G_FSQRT)
710       .legalFor({S32, S64})
711       .scalarize(0)
712       .clampScalar(0, S32, S64);
713 
714     if (ST.hasFractBug()) {
715       getActionDefinitionsBuilder(G_FFLOOR)
716         .customFor({S64})
717         .legalFor({S32, S64})
718         .scalarize(0)
719         .clampScalar(0, S32, S64);
720     } else {
721       getActionDefinitionsBuilder(G_FFLOOR)
722         .legalFor({S32, S64})
723         .scalarize(0)
724         .clampScalar(0, S32, S64);
725     }
726   }
727 
728   getActionDefinitionsBuilder(G_FPTRUNC)
729     .legalFor({{S32, S64}, {S16, S32}})
730     .scalarize(0)
731     .lower();
732 
733   getActionDefinitionsBuilder(G_FPEXT)
734     .legalFor({{S64, S32}, {S32, S16}})
735     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
736     .scalarize(0);
737 
738   getActionDefinitionsBuilder(G_FSUB)
739       // Use actual fsub instruction
740       .legalFor({S32})
741       // Must use fadd + fneg
742       .lowerFor({S64, S16, V2S16})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746   // Whether this is legal depends on the floating point mode for the function.
747   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
748   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
749     FMad.customFor({S32, S16});
750   else if (ST.hasMadMacF32Insts())
751     FMad.customFor({S32});
752   else if (ST.hasMadF16())
753     FMad.customFor({S16});
754   FMad.scalarize(0)
755       .lower();
756 
757   auto &FRem = getActionDefinitionsBuilder(G_FREM);
758   if (ST.has16BitInsts()) {
759     FRem.customFor({S16, S32, S64});
760   } else {
761     FRem.minScalar(0, S32)
762         .customFor({S32, S64});
763   }
764   FRem.scalarize(0);
765 
766   // TODO: Do we need to clamp maximum bitwidth?
767   getActionDefinitionsBuilder(G_TRUNC)
768     .legalIf(isScalar(0))
769     .legalFor({{V2S16, V2S32}})
770     .clampMaxNumElements(0, S16, 2)
771     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
772     // situations (like an invalid implicit use), we don't want to infinite loop
773     // in the legalizer.
774     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
775     .alwaysLegal();
776 
777   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
778     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
779                {S32, S1}, {S64, S1}, {S16, S1}})
780     .scalarize(0)
781     .clampScalar(0, S32, S64)
782     .widenScalarToNextPow2(1, 32);
783 
784   // TODO: Split s1->s64 during regbankselect for VALU.
785   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
786     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
787     .lowerFor({{S32, S64}})
788     .lowerIf(typeIs(1, S1))
789     .customFor({{S64, S64}});
790   if (ST.has16BitInsts())
791     IToFP.legalFor({{S16, S16}});
792   IToFP.clampScalar(1, S32, S64)
793        .minScalar(0, S32)
794        .scalarize(0)
795        .widenScalarToNextPow2(1);
796 
797   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
798     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
799     .customFor({{S64, S64}})
800     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
801   if (ST.has16BitInsts())
802     FPToI.legalFor({{S16, S16}});
803   else
804     FPToI.minScalar(1, S32);
805 
806   FPToI.minScalar(0, S32)
807        .scalarize(0)
808        .lower();
809 
810   // Lower roundeven into G_FRINT
811   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
812     .scalarize(0)
813     .lower();
814 
815   if (ST.has16BitInsts()) {
816     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
817       .legalFor({S16, S32, S64})
818       .clampScalar(0, S16, S64)
819       .scalarize(0);
820   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
821     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
822       .legalFor({S32, S64})
823       .clampScalar(0, S32, S64)
824       .scalarize(0);
825   } else {
826     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
827       .legalFor({S32})
828       .customFor({S64})
829       .clampScalar(0, S32, S64)
830       .scalarize(0);
831   }
832 
833   getActionDefinitionsBuilder(G_PTR_ADD)
834     .legalIf(all(isPointer(0), sameSize(0, 1)))
835     .scalarize(0)
836     .scalarSameSizeAs(1, 0);
837 
838   getActionDefinitionsBuilder(G_PTRMASK)
839     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
840     .scalarSameSizeAs(1, 0)
841     .scalarize(0);
842 
843   auto &CmpBuilder =
844     getActionDefinitionsBuilder(G_ICMP)
845     // The compare output type differs based on the register bank of the output,
846     // so make both s1 and s32 legal.
847     //
848     // Scalar compares producing output in scc will be promoted to s32, as that
849     // is the allocatable register type that will be needed for the copy from
850     // scc. This will be promoted during RegBankSelect, and we assume something
851     // before that won't try to use s32 result types.
852     //
853     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
854     // bank.
855     .legalForCartesianProduct(
856       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
857     .legalForCartesianProduct(
858       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
859   if (ST.has16BitInsts()) {
860     CmpBuilder.legalFor({{S1, S16}});
861   }
862 
863   CmpBuilder
864     .widenScalarToNextPow2(1)
865     .clampScalar(1, S32, S64)
866     .scalarize(0)
867     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
868 
869   getActionDefinitionsBuilder(G_FCMP)
870     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
871     .widenScalarToNextPow2(1)
872     .clampScalar(1, S32, S64)
873     .scalarize(0);
874 
875   // FIXME: fpow has a selection pattern that should move to custom lowering.
876   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
877   if (ST.has16BitInsts())
878     Exp2Ops.legalFor({S32, S16});
879   else
880     Exp2Ops.legalFor({S32});
881   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
882   Exp2Ops.scalarize(0);
883 
884   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
885   if (ST.has16BitInsts())
886     ExpOps.customFor({{S32}, {S16}});
887   else
888     ExpOps.customFor({S32});
889   ExpOps.clampScalar(0, MinScalarFPTy, S32)
890         .scalarize(0);
891 
892   getActionDefinitionsBuilder(G_FPOWI)
893     .clampScalar(0, MinScalarFPTy, S32)
894     .lower();
895 
896   // The 64-bit versions produce 32-bit results, but only on the SALU.
897   getActionDefinitionsBuilder(G_CTPOP)
898     .legalFor({{S32, S32}, {S32, S64}})
899     .clampScalar(0, S32, S32)
900     .clampScalar(1, S32, S64)
901     .scalarize(0)
902     .widenScalarToNextPow2(0, 32)
903     .widenScalarToNextPow2(1, 32);
904 
905   // The hardware instructions return a different result on 0 than the generic
906   // instructions expect. The hardware produces -1, but these produce the
907   // bitwidth.
908   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
909     .scalarize(0)
910     .clampScalar(0, S32, S32)
911     .clampScalar(1, S32, S64)
912     .widenScalarToNextPow2(0, 32)
913     .widenScalarToNextPow2(1, 32)
914     .lower();
915 
916   // The 64-bit versions produce 32-bit results, but only on the SALU.
917   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
918     .legalFor({{S32, S32}, {S32, S64}})
919     .clampScalar(0, S32, S32)
920     .clampScalar(1, S32, S64)
921     .scalarize(0)
922     .widenScalarToNextPow2(0, 32)
923     .widenScalarToNextPow2(1, 32);
924 
925   getActionDefinitionsBuilder(G_BITREVERSE)
926     .legalFor({S32})
927     .clampScalar(0, S32, S32)
928     .scalarize(0);
929 
930   if (ST.has16BitInsts()) {
931     getActionDefinitionsBuilder(G_BSWAP)
932       .legalFor({S16, S32, V2S16})
933       .clampMaxNumElements(0, S16, 2)
934       // FIXME: Fixing non-power-of-2 before clamp is workaround for
935       // narrowScalar limitation.
936       .widenScalarToNextPow2(0)
937       .clampScalar(0, S16, S32)
938       .scalarize(0);
939 
940     if (ST.hasVOP3PInsts()) {
941       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
942         .legalFor({S32, S16, V2S16})
943         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
944         .clampMaxNumElements(0, S16, 2)
945         .minScalar(0, S16)
946         .widenScalarToNextPow2(0)
947         .scalarize(0)
948         .lower();
949     } else {
950       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
951         .legalFor({S32, S16})
952         .widenScalarToNextPow2(0)
953         .minScalar(0, S16)
954         .scalarize(0)
955         .lower();
956     }
957   } else {
958     // TODO: Should have same legality without v_perm_b32
959     getActionDefinitionsBuilder(G_BSWAP)
960       .legalFor({S32})
961       .lowerIf(scalarNarrowerThan(0, 32))
962       // FIXME: Fixing non-power-of-2 before clamp is workaround for
963       // narrowScalar limitation.
964       .widenScalarToNextPow2(0)
965       .maxScalar(0, S32)
966       .scalarize(0)
967       .lower();
968 
969     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970       .legalFor({S32})
971       .minScalar(0, S32)
972       .widenScalarToNextPow2(0)
973       .scalarize(0)
974       .lower();
975   }
976 
977   getActionDefinitionsBuilder(G_INTTOPTR)
978     // List the common cases
979     .legalForCartesianProduct(AddrSpaces64, {S64})
980     .legalForCartesianProduct(AddrSpaces32, {S32})
981     .scalarize(0)
982     // Accept any address space as long as the size matches
983     .legalIf(sameSize(0, 1))
984     .widenScalarIf(smallerThan(1, 0),
985       [](const LegalityQuery &Query) {
986         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
987       })
988     .narrowScalarIf(largerThan(1, 0),
989       [](const LegalityQuery &Query) {
990         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
991       });
992 
993   getActionDefinitionsBuilder(G_PTRTOINT)
994     // List the common cases
995     .legalForCartesianProduct(AddrSpaces64, {S64})
996     .legalForCartesianProduct(AddrSpaces32, {S32})
997     .scalarize(0)
998     // Accept any address space as long as the size matches
999     .legalIf(sameSize(0, 1))
1000     .widenScalarIf(smallerThan(0, 1),
1001       [](const LegalityQuery &Query) {
1002         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1003       })
1004     .narrowScalarIf(
1005       largerThan(0, 1),
1006       [](const LegalityQuery &Query) {
1007         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1008       });
1009 
1010   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1011     .scalarize(0)
1012     .custom();
1013 
1014   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1015                                     bool IsLoad) -> bool {
1016     const LLT DstTy = Query.Types[0];
1017 
1018     // Split vector extloads.
1019     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1021 
1022     if (MemSize < DstTy.getSizeInBits())
1023       MemSize = std::max(MemSize, AlignBits);
1024 
1025     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1026       return true;
1027 
1028     const LLT PtrTy = Query.Types[1];
1029     unsigned AS = PtrTy.getAddressSpace();
1030     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1031       return true;
1032 
1033     // Catch weird sized loads that don't evenly divide into the access sizes
1034     // TODO: May be able to widen depending on alignment etc.
1035     unsigned NumRegs = (MemSize + 31) / 32;
1036     if (NumRegs == 3) {
1037       if (!ST.hasDwordx3LoadStores())
1038         return true;
1039     } else {
1040       // If the alignment allows, these should have been widened.
1041       if (!isPowerOf2_32(NumRegs))
1042         return true;
1043     }
1044 
1045     if (AlignBits < MemSize) {
1046       const SITargetLowering *TLI = ST.getTargetLowering();
1047       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1048                                                       Align(AlignBits / 8));
1049     }
1050 
1051     return false;
1052   };
1053 
1054   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1055   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1056   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1057 
1058   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1059   // LDS
1060   // TODO: Unsupported flat for SI.
1061 
1062   for (unsigned Op : {G_LOAD, G_STORE}) {
1063     const bool IsStore = Op == G_STORE;
1064 
1065     auto &Actions = getActionDefinitionsBuilder(Op);
1066     // Explicitly list some common cases.
1067     // TODO: Does this help compile time at all?
1068     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1069                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1070                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1071                                       {S64, GlobalPtr, 64, GlobalAlign32},
1072                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1073                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1074                                       {S32, GlobalPtr, 8, GlobalAlign8},
1075                                       {S32, GlobalPtr, 16, GlobalAlign16},
1076 
1077                                       {S32, LocalPtr, 32, 32},
1078                                       {S64, LocalPtr, 64, 32},
1079                                       {V2S32, LocalPtr, 64, 32},
1080                                       {S32, LocalPtr, 8, 8},
1081                                       {S32, LocalPtr, 16, 16},
1082                                       {V2S16, LocalPtr, 32, 32},
1083 
1084                                       {S32, PrivatePtr, 32, 32},
1085                                       {S32, PrivatePtr, 8, 8},
1086                                       {S32, PrivatePtr, 16, 16},
1087                                       {V2S16, PrivatePtr, 32, 32},
1088 
1089                                       {S32, ConstantPtr, 32, GlobalAlign32},
1090                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1091                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1092                                       {S64, ConstantPtr, 64, GlobalAlign32},
1093                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1094     Actions.legalIf(
1095       [=](const LegalityQuery &Query) -> bool {
1096         return isLoadStoreLegal(ST, Query, Op);
1097       });
1098 
1099     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1100     // 64-bits.
1101     //
1102     // TODO: Should generalize bitcast action into coerce, which will also cover
1103     // inserting addrspacecasts.
1104     Actions.customIf(typeIs(1, Constant32Ptr));
1105 
1106     // Turn any illegal element vectors into something easier to deal
1107     // with. These will ultimately produce 32-bit scalar shifts to extract the
1108     // parts anyway.
1109     //
1110     // For odd 16-bit element vectors, prefer to split those into pieces with
1111     // 16-bit vector parts.
1112     Actions.bitcastIf(
1113       [=](const LegalityQuery &Query) -> bool {
1114         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1115                                           Query.MMODescrs[0].SizeInBits);
1116       }, bitcastToRegisterType(0));
1117 
1118     if (!IsStore) {
1119       // Widen suitably aligned loads by loading extra bytes. The standard
1120       // legalization actions can't properly express widening memory operands.
1121       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1122         return shouldWidenLoad(ST, Query, G_LOAD);
1123       });
1124     }
1125 
1126     // FIXME: load/store narrowing should be moved to lower action
1127     Actions
1128         .narrowScalarIf(
1129             [=](const LegalityQuery &Query) -> bool {
1130               return !Query.Types[0].isVector() &&
1131                      needToSplitMemOp(Query, Op == G_LOAD);
1132             },
1133             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1134               const LLT DstTy = Query.Types[0];
1135               const LLT PtrTy = Query.Types[1];
1136 
1137               const unsigned DstSize = DstTy.getSizeInBits();
1138               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1139 
1140               // Split extloads.
1141               if (DstSize > MemSize)
1142                 return std::make_pair(0, LLT::scalar(MemSize));
1143 
1144               if (!isPowerOf2_32(DstSize)) {
1145                 // We're probably decomposing an odd sized store. Try to split
1146                 // to the widest type. TODO: Account for alignment. As-is it
1147                 // should be OK, since the new parts will be further legalized.
1148                 unsigned FloorSize = PowerOf2Floor(DstSize);
1149                 return std::make_pair(0, LLT::scalar(FloorSize));
1150               }
1151 
1152               if (DstSize > 32 && (DstSize % 32 != 0)) {
1153                 // FIXME: Need a way to specify non-extload of larger size if
1154                 // suitably aligned.
1155                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1156               }
1157 
1158               unsigned MaxSize = maxSizeForAddrSpace(ST,
1159                                                      PtrTy.getAddressSpace(),
1160                                                      Op == G_LOAD);
1161               if (MemSize > MaxSize)
1162                 return std::make_pair(0, LLT::scalar(MaxSize));
1163 
1164               unsigned Align = Query.MMODescrs[0].AlignInBits;
1165               return std::make_pair(0, LLT::scalar(Align));
1166             })
1167         .fewerElementsIf(
1168             [=](const LegalityQuery &Query) -> bool {
1169               return Query.Types[0].isVector() &&
1170                      needToSplitMemOp(Query, Op == G_LOAD);
1171             },
1172             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1173               const LLT DstTy = Query.Types[0];
1174               const LLT PtrTy = Query.Types[1];
1175 
1176               LLT EltTy = DstTy.getElementType();
1177               unsigned MaxSize = maxSizeForAddrSpace(ST,
1178                                                      PtrTy.getAddressSpace(),
1179                                                      Op == G_LOAD);
1180 
1181               // FIXME: Handle widened to power of 2 results better. This ends
1182               // up scalarizing.
1183               // FIXME: 3 element stores scalarized on SI
1184 
1185               // Split if it's too large for the address space.
1186               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1187                 unsigned NumElts = DstTy.getNumElements();
1188                 unsigned EltSize = EltTy.getSizeInBits();
1189 
1190                 if (MaxSize % EltSize == 0) {
1191                   return std::make_pair(
1192                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1193                 }
1194 
1195                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1196 
1197                 // FIXME: Refine when odd breakdowns handled
1198                 // The scalars will need to be re-legalized.
1199                 if (NumPieces == 1 || NumPieces >= NumElts ||
1200                     NumElts % NumPieces != 0)
1201                   return std::make_pair(0, EltTy);
1202 
1203                 return std::make_pair(0,
1204                                       LLT::vector(NumElts / NumPieces, EltTy));
1205               }
1206 
1207               // FIXME: We could probably handle weird extending loads better.
1208               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1209               if (DstTy.getSizeInBits() > MemSize)
1210                 return std::make_pair(0, EltTy);
1211 
1212               unsigned EltSize = EltTy.getSizeInBits();
1213               unsigned DstSize = DstTy.getSizeInBits();
1214               if (!isPowerOf2_32(DstSize)) {
1215                 // We're probably decomposing an odd sized store. Try to split
1216                 // to the widest type. TODO: Account for alignment. As-is it
1217                 // should be OK, since the new parts will be further legalized.
1218                 unsigned FloorSize = PowerOf2Floor(DstSize);
1219                 return std::make_pair(
1220                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1221               }
1222 
1223               // Need to split because of alignment.
1224               unsigned Align = Query.MMODescrs[0].AlignInBits;
1225               if (EltSize > Align &&
1226                   (EltSize / Align < DstTy.getNumElements())) {
1227                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1228               }
1229 
1230               // May need relegalization for the scalars.
1231               return std::make_pair(0, EltTy);
1232             })
1233     .lowerIfMemSizeNotPow2()
1234     .minScalar(0, S32);
1235 
1236     if (IsStore)
1237       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1238 
1239     Actions
1240         .widenScalarToNextPow2(0)
1241         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1242         .lower();
1243   }
1244 
1245   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1246                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1247                                                   {S32, GlobalPtr, 16, 2 * 8},
1248                                                   {S32, LocalPtr, 8, 8},
1249                                                   {S32, LocalPtr, 16, 16},
1250                                                   {S32, PrivatePtr, 8, 8},
1251                                                   {S32, PrivatePtr, 16, 16},
1252                                                   {S32, ConstantPtr, 8, 8},
1253                                                   {S32, ConstantPtr, 16, 2 * 8}});
1254   if (ST.hasFlatAddressSpace()) {
1255     ExtLoads.legalForTypesWithMemDesc(
1256         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1257   }
1258 
1259   ExtLoads.clampScalar(0, S32, S32)
1260           .widenScalarToNextPow2(0)
1261           .unsupportedIfMemSizeNotPow2()
1262           .lower();
1263 
1264   auto &Atomics = getActionDefinitionsBuilder(
1265     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1266      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1267      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1268      G_ATOMICRMW_UMIN})
1269     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1270                {S64, GlobalPtr}, {S64, LocalPtr},
1271                {S32, RegionPtr}, {S64, RegionPtr}});
1272   if (ST.hasFlatAddressSpace()) {
1273     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1274   }
1275 
1276   if (ST.hasLDSFPAtomics()) {
1277     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1278       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1279   }
1280 
1281   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1282   // demarshalling
1283   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1284     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1285                 {S32, FlatPtr}, {S64, FlatPtr}})
1286     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1287                {S32, RegionPtr}, {S64, RegionPtr}});
1288   // TODO: Pointer types, any 32-bit or 64-bit vector
1289 
1290   // Condition should be s32 for scalar, s1 for vector.
1291   getActionDefinitionsBuilder(G_SELECT)
1292     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1293           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1294           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1295     .clampScalar(0, S16, S64)
1296     .scalarize(1)
1297     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1298     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1299     .clampMaxNumElements(0, S32, 2)
1300     .clampMaxNumElements(0, LocalPtr, 2)
1301     .clampMaxNumElements(0, PrivatePtr, 2)
1302     .scalarize(0)
1303     .widenScalarToNextPow2(0)
1304     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1305 
1306   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1307   // be more flexible with the shift amount type.
1308   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1309     .legalFor({{S32, S32}, {S64, S32}});
1310   if (ST.has16BitInsts()) {
1311     if (ST.hasVOP3PInsts()) {
1312       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1313             .clampMaxNumElements(0, S16, 2);
1314     } else
1315       Shifts.legalFor({{S16, S16}});
1316 
1317     // TODO: Support 16-bit shift amounts for all types
1318     Shifts.widenScalarIf(
1319       [=](const LegalityQuery &Query) {
1320         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1321         // 32-bit amount.
1322         const LLT ValTy = Query.Types[0];
1323         const LLT AmountTy = Query.Types[1];
1324         return ValTy.getSizeInBits() <= 16 &&
1325                AmountTy.getSizeInBits() < 16;
1326       }, changeTo(1, S16));
1327     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1328     Shifts.clampScalar(1, S32, S32);
1329     Shifts.clampScalar(0, S16, S64);
1330     Shifts.widenScalarToNextPow2(0, 16);
1331 
1332     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1333       .minScalar(0, S16)
1334       .scalarize(0)
1335       .lower();
1336   } else {
1337     // Make sure we legalize the shift amount type first, as the general
1338     // expansion for the shifted type will produce much worse code if it hasn't
1339     // been truncated already.
1340     Shifts.clampScalar(1, S32, S32);
1341     Shifts.clampScalar(0, S32, S64);
1342     Shifts.widenScalarToNextPow2(0, 32);
1343 
1344     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1345       .minScalar(0, S32)
1346       .scalarize(0)
1347       .lower();
1348   }
1349   Shifts.scalarize(0);
1350 
1351   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1352     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1353     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1354     unsigned IdxTypeIdx = 2;
1355 
1356     getActionDefinitionsBuilder(Op)
1357       .customIf([=](const LegalityQuery &Query) {
1358           const LLT EltTy = Query.Types[EltTypeIdx];
1359           const LLT VecTy = Query.Types[VecTypeIdx];
1360           const LLT IdxTy = Query.Types[IdxTypeIdx];
1361           const unsigned EltSize = EltTy.getSizeInBits();
1362           return (EltSize == 32 || EltSize == 64) &&
1363                   VecTy.getSizeInBits() % 32 == 0 &&
1364                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1365                   IdxTy.getSizeInBits() == 32;
1366         })
1367       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1368                  bitcastToVectorElement32(VecTypeIdx))
1369       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1370       .bitcastIf(
1371         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1372         [=](const LegalityQuery &Query) {
1373           // For > 64-bit element types, try to turn this into a 64-bit
1374           // element vector since we may be able to do better indexing
1375           // if this is scalar. If not, fall back to 32.
1376           const LLT EltTy = Query.Types[EltTypeIdx];
1377           const LLT VecTy = Query.Types[VecTypeIdx];
1378           const unsigned DstEltSize = EltTy.getSizeInBits();
1379           const unsigned VecSize = VecTy.getSizeInBits();
1380 
1381           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1382           return std::make_pair(
1383             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1384         })
1385       .clampScalar(EltTypeIdx, S32, S64)
1386       .clampScalar(VecTypeIdx, S32, S64)
1387       .clampScalar(IdxTypeIdx, S32, S32)
1388       .clampMaxNumElements(VecTypeIdx, S32, 32)
1389       // TODO: Clamp elements for 64-bit vectors?
1390       // It should only be necessary with variable indexes.
1391       // As a last resort, lower to the stack
1392       .lower();
1393   }
1394 
1395   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1396     .unsupportedIf([=](const LegalityQuery &Query) {
1397         const LLT &EltTy = Query.Types[1].getElementType();
1398         return Query.Types[0] != EltTy;
1399       });
1400 
1401   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1402     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1403     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1404 
1405     // FIXME: Doesn't handle extract of illegal sizes.
1406     getActionDefinitionsBuilder(Op)
1407       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1408       // FIXME: Multiples of 16 should not be legal.
1409       .legalIf([=](const LegalityQuery &Query) {
1410           const LLT BigTy = Query.Types[BigTyIdx];
1411           const LLT LitTy = Query.Types[LitTyIdx];
1412           return (BigTy.getSizeInBits() % 32 == 0) &&
1413                  (LitTy.getSizeInBits() % 16 == 0);
1414         })
1415       .widenScalarIf(
1416         [=](const LegalityQuery &Query) {
1417           const LLT BigTy = Query.Types[BigTyIdx];
1418           return (BigTy.getScalarSizeInBits() < 16);
1419         },
1420         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1421       .widenScalarIf(
1422         [=](const LegalityQuery &Query) {
1423           const LLT LitTy = Query.Types[LitTyIdx];
1424           return (LitTy.getScalarSizeInBits() < 16);
1425         },
1426         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1427       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1428       .widenScalarToNextPow2(BigTyIdx, 32);
1429 
1430   }
1431 
1432   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1433     .legalForCartesianProduct(AllS32Vectors, {S32})
1434     .legalForCartesianProduct(AllS64Vectors, {S64})
1435     .clampNumElements(0, V16S32, V32S32)
1436     .clampNumElements(0, V2S64, V16S64)
1437     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1438 
1439   if (ST.hasScalarPackInsts()) {
1440     BuildVector
1441       // FIXME: Should probably widen s1 vectors straight to s32
1442       .minScalarOrElt(0, S16)
1443       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1444       .minScalar(1, S32);
1445 
1446     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1447       .legalFor({V2S16, S32})
1448       .lower();
1449     BuildVector.minScalarOrElt(0, S32);
1450   } else {
1451     BuildVector.customFor({V2S16, S16});
1452     BuildVector.minScalarOrElt(0, S32);
1453 
1454     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1455       .customFor({V2S16, S32})
1456       .lower();
1457   }
1458 
1459   BuildVector.legalIf(isRegisterType(0));
1460 
1461   // FIXME: Clamp maximum size
1462   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1463     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1464     .clampMaxNumElements(0, S32, 32)
1465     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1466     .clampMaxNumElements(0, S16, 64);
1467 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1469   // pre-legalize.
1470   if (ST.hasVOP3PInsts()) {
1471     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1472       .customFor({V2S16, V2S16})
1473       .lower();
1474   } else
1475     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1476 
1477   // Merge/Unmerge
1478   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1479     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1480     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1481 
1482     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1483       const LLT Ty = Query.Types[TypeIdx];
1484       if (Ty.isVector()) {
1485         const LLT &EltTy = Ty.getElementType();
1486         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1487           return true;
1488         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1489           return true;
1490       }
1491       return false;
1492     };
1493 
1494     auto &Builder = getActionDefinitionsBuilder(Op)
1495       .lowerFor({{S16, V2S16}})
1496       .lowerIf([=](const LegalityQuery &Query) {
1497           const LLT BigTy = Query.Types[BigTyIdx];
1498           return BigTy.getSizeInBits() == 32;
1499         })
1500       // Try to widen to s16 first for small types.
1501       // TODO: Only do this on targets with legal s16 shifts
1502       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1503       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1504       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1505       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1506                            elementTypeIs(1, S16)),
1507                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1509       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1510       // valid.
1511       .clampScalar(LitTyIdx, S32, S512)
1512       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1513       // Break up vectors with weird elements into scalars
1514       .fewerElementsIf(
1515         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1516         scalarize(0))
1517       .fewerElementsIf(
1518         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1519         scalarize(1))
1520       .clampScalar(BigTyIdx, S32, MaxScalar);
1521 
1522     if (Op == G_MERGE_VALUES) {
1523       Builder.widenScalarIf(
1524         // TODO: Use 16-bit shifts if legal for 8-bit values?
1525         [=](const LegalityQuery &Query) {
1526           const LLT Ty = Query.Types[LitTyIdx];
1527           return Ty.getSizeInBits() < 32;
1528         },
1529         changeTo(LitTyIdx, S32));
1530     }
1531 
1532     Builder.widenScalarIf(
1533       [=](const LegalityQuery &Query) {
1534         const LLT Ty = Query.Types[BigTyIdx];
1535         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1536           Ty.getSizeInBits() % 16 != 0;
1537       },
1538       [=](const LegalityQuery &Query) {
1539         // Pick the next power of 2, or a multiple of 64 over 128.
1540         // Whichever is smaller.
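        // E.g. s72 is widened to s128, while s300 is widened to s320 (a
        // multiple of 64) rather than to s512.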
1541         const LLT &Ty = Query.Types[BigTyIdx];
1542         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1543         if (NewSizeInBits >= 256) {
1544           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1545           if (RoundedTo < NewSizeInBits)
1546             NewSizeInBits = RoundedTo;
1547         }
1548         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1549       })
1550       .legalIf([=](const LegalityQuery &Query) {
1551           const LLT &BigTy = Query.Types[BigTyIdx];
1552           const LLT &LitTy = Query.Types[LitTyIdx];
1553 
1554           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1555             return false;
1556           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1557             return false;
1558 
1559           return BigTy.getSizeInBits() % 16 == 0 &&
1560                  LitTy.getSizeInBits() % 16 == 0 &&
1561                  BigTy.getSizeInBits() <= MaxRegisterSize;
1562         })
1563       // Any vectors left are the wrong size. Scalarize them.
1564       .scalarize(0)
1565       .scalarize(1);
1566   }
1567 
1568   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1569   // RegBankSelect.
1570   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1571     .legalFor({{S32}, {S64}});
1572 
1573   if (ST.hasVOP3PInsts()) {
1574     SextInReg.lowerFor({{V2S16}})
1575       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1576       // get more vector shift opportunities, since we'll get those when
1577       // expanded.
1578       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1579   } else if (ST.has16BitInsts()) {
1580     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1581   } else {
1582     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1584     SextInReg.lowerFor({{S32}, {S64}});
1585   }
1586 
1587   SextInReg
1588     .scalarize(0)
1589     .clampScalar(0, S32, S64)
1590     .lower();
1591 
1592   getActionDefinitionsBuilder(G_FSHR)
1593     .legalFor({{S32, S32}})
1594     .scalarize(0)
1595     .lower();
1596 
1597   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1598     .legalFor({S64});
1599 
1600   getActionDefinitionsBuilder(G_FENCE)
1601     .alwaysLegal();
1602 
1603   getActionDefinitionsBuilder({
1604       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1605       G_FCOPYSIGN,
1606 
1607       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1608       G_ATOMICRMW_NAND,
1609       G_ATOMICRMW_FSUB,
1610       G_READ_REGISTER,
1611       G_WRITE_REGISTER,
1612 
1613       G_SADDO, G_SSUBO,
1614 
      // TODO: Implement
1616       G_FMINIMUM, G_FMAXIMUM,
1617       G_FSHL
1618     }).lower();
1619 
1620   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1621         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1622         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1623     .unsupported();
1624 
1625   computeTables();
1626   verify(*ST.getInstrInfo());
1627 }
1628 
1629 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1630                                          MachineInstr &MI) const {
1631   MachineIRBuilder &B = Helper.MIRBuilder;
1632   MachineRegisterInfo &MRI = *B.getMRI();
1633 
1634   switch (MI.getOpcode()) {
1635   case TargetOpcode::G_ADDRSPACE_CAST:
1636     return legalizeAddrSpaceCast(MI, MRI, B);
1637   case TargetOpcode::G_FRINT:
1638     return legalizeFrint(MI, MRI, B);
1639   case TargetOpcode::G_FCEIL:
1640     return legalizeFceil(MI, MRI, B);
1641   case TargetOpcode::G_FREM:
1642     return legalizeFrem(MI, MRI, B);
1643   case TargetOpcode::G_INTRINSIC_TRUNC:
1644     return legalizeIntrinsicTrunc(MI, MRI, B);
1645   case TargetOpcode::G_SITOFP:
1646     return legalizeITOFP(MI, MRI, B, true);
1647   case TargetOpcode::G_UITOFP:
1648     return legalizeITOFP(MI, MRI, B, false);
1649   case TargetOpcode::G_FPTOSI:
1650     return legalizeFPTOI(MI, MRI, B, true);
1651   case TargetOpcode::G_FPTOUI:
1652     return legalizeFPTOI(MI, MRI, B, false);
1653   case TargetOpcode::G_FMINNUM:
1654   case TargetOpcode::G_FMAXNUM:
1655   case TargetOpcode::G_FMINNUM_IEEE:
1656   case TargetOpcode::G_FMAXNUM_IEEE:
1657     return legalizeMinNumMaxNum(Helper, MI);
1658   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1659     return legalizeExtractVectorElt(MI, MRI, B);
1660   case TargetOpcode::G_INSERT_VECTOR_ELT:
1661     return legalizeInsertVectorElt(MI, MRI, B);
1662   case TargetOpcode::G_SHUFFLE_VECTOR:
1663     return legalizeShuffleVector(MI, MRI, B);
1664   case TargetOpcode::G_FSIN:
1665   case TargetOpcode::G_FCOS:
1666     return legalizeSinCos(MI, MRI, B);
1667   case TargetOpcode::G_GLOBAL_VALUE:
1668     return legalizeGlobalValue(MI, MRI, B);
1669   case TargetOpcode::G_LOAD:
1670     return legalizeLoad(Helper, MI);
1671   case TargetOpcode::G_FMAD:
1672     return legalizeFMad(MI, MRI, B);
1673   case TargetOpcode::G_FDIV:
1674     return legalizeFDIV(MI, MRI, B);
1675   case TargetOpcode::G_UDIV:
1676   case TargetOpcode::G_UREM:
1677     return legalizeUDIV_UREM(MI, MRI, B);
1678   case TargetOpcode::G_SDIV:
1679   case TargetOpcode::G_SREM:
1680     return legalizeSDIV_SREM(MI, MRI, B);
1681   case TargetOpcode::G_ATOMIC_CMPXCHG:
1682     return legalizeAtomicCmpXChg(MI, MRI, B);
1683   case TargetOpcode::G_FLOG:
1684     return legalizeFlog(MI, B, numbers::ln2f);
1685   case TargetOpcode::G_FLOG10:
1686     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1687   case TargetOpcode::G_FEXP:
1688     return legalizeFExp(MI, B);
1689   case TargetOpcode::G_FPOW:
1690     return legalizeFPow(MI, B);
1691   case TargetOpcode::G_FFLOOR:
1692     return legalizeFFloor(MI, MRI, B);
1693   case TargetOpcode::G_BUILD_VECTOR:
1694     return legalizeBuildVector(MI, MRI, B);
1695   default:
1696     return false;
1697   }
1698 
1699   llvm_unreachable("expected switch to return");
1700 }
1701 
1702 Register AMDGPULegalizerInfo::getSegmentAperture(
1703   unsigned AS,
1704   MachineRegisterInfo &MRI,
1705   MachineIRBuilder &B) const {
1706   MachineFunction &MF = B.getMF();
1707   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1708   const LLT S32 = LLT::scalar(32);
1709 
1710   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1711 
1712   if (ST.hasApertureRegs()) {
1713     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1714     // getreg.
1715     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1716         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1717         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1718     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1719         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1720         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1721     unsigned Encoding =
1722         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1723         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1724         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1725 
1726     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1727 
1728     B.buildInstr(AMDGPU::S_GETREG_B32)
1729       .addDef(GetReg)
1730       .addImm(Encoding);
1731     MRI.setType(GetReg, S32);
1732 
1733     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
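    // Note: the value read from MEM_BASES is the aperture base shifted right
    // by 16 bits, so shifting it left by WidthM1 + 1 (16) reconstructs the
    // high 32 bits of the 64-bit aperture address.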
1734     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1735   }
1736 
1737   Register QueuePtr = MRI.createGenericVirtualRegister(
1738     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1739 
1740   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1741     return Register();
1742 
1743   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1744   // private_segment_aperture_base_hi.
1745   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1746 
1747   // TODO: can we be smarter about machine pointer info?
1748   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1749   MachineMemOperand *MMO = MF.getMachineMemOperand(
1750       PtrInfo,
1751       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1752           MachineMemOperand::MOInvariant,
1753       4, commonAlignment(Align(64), StructOffset));
1754 
1755   Register LoadAddr;
1756 
1757   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1758   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1759 }
1760 
1761 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1762   MachineInstr &MI, MachineRegisterInfo &MRI,
1763   MachineIRBuilder &B) const {
1764   MachineFunction &MF = B.getMF();
1765 
1766   const LLT S32 = LLT::scalar(32);
1767   Register Dst = MI.getOperand(0).getReg();
1768   Register Src = MI.getOperand(1).getReg();
1769 
1770   LLT DstTy = MRI.getType(Dst);
1771   LLT SrcTy = MRI.getType(Src);
1772   unsigned DestAS = DstTy.getAddressSpace();
1773   unsigned SrcAS = SrcTy.getAddressSpace();
1774 
1775   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1776   // vector element.
1777   assert(!DstTy.isVector());
1778 
1779   const AMDGPUTargetMachine &TM
1780     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1781 
1782   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1783     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1784     return true;
1785   }
1786 
1787   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1788     // Truncate.
1789     B.buildExtract(Dst, Src, 0);
1790     MI.eraseFromParent();
1791     return true;
1792   }
1793 
1794   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1795     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1796     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1797 
1798     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1799     // another. Merge operands are required to be the same type, but creating an
1800     // extra ptrtoint would be kind of pointless.
1801     auto HighAddr = B.buildConstant(
1802       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1803     B.buildMerge(Dst, {Src, HighAddr});
1804     MI.eraseFromParent();
1805     return true;
1806   }
1807 
1808   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1809     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1810            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1811     unsigned NullVal = TM.getNullPointerValue(DestAS);
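    // Casting the flat null pointer must produce the segment's null value
    // rather than a truncated zero, hence the compare-and-select below.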
1812 
1813     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1814     auto FlatNull = B.buildConstant(SrcTy, 0);
1815 
1816     // Extract low 32-bits of the pointer.
1817     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1818 
1819     auto CmpRes =
1820         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1821     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1822 
1823     MI.eraseFromParent();
1824     return true;
1825   }
1826 
1827   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1828     return false;
1829 
1830   if (!ST.hasFlatAddressSpace())
1831     return false;
1832 
1833   auto SegmentNull =
1834       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1835   auto FlatNull =
1836       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1837 
1838   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1839   if (!ApertureReg.isValid())
1840     return false;
1841 
1842   auto CmpRes =
1843       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1844 
1845   // Coerce the type of the low half of the result so we can use merge_values.
1846   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1847 
1848   // TODO: Should we allow mismatched types but matching sizes in merges to
1849   // avoid the ptrtoint?
1850   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1851   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1852 
1853   MI.eraseFromParent();
1854   return true;
1855 }
1856 
1857 bool AMDGPULegalizerInfo::legalizeFrint(
1858   MachineInstr &MI, MachineRegisterInfo &MRI,
1859   MachineIRBuilder &B) const {
1860   Register Src = MI.getOperand(1).getReg();
1861   LLT Ty = MRI.getType(Src);
1862   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1863 
1864   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1865   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
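  // C1 is 2^52 and C2 is the largest double below 2^52. Adding and then
  // subtracting 2^52 (with Src's sign, via copysign) rounds Src to an integer
  // when |Src| < 2^52; larger magnitudes are already integral and are passed
  // through by the final select.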
1866 
1867   auto C1 = B.buildFConstant(Ty, C1Val);
1868   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1869 
1870   // TODO: Should this propagate fast-math-flags?
1871   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1872   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1873 
1874   auto C2 = B.buildFConstant(Ty, C2Val);
1875   auto Fabs = B.buildFAbs(Ty, Src);
1876 
1877   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1878   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1879   MI.eraseFromParent();
1880   return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizeFceil(
1884   MachineInstr &MI, MachineRegisterInfo &MRI,
1885   MachineIRBuilder &B) const {
1886 
1887   const LLT S1 = LLT::scalar(1);
1888   const LLT S64 = LLT::scalar(64);
1889 
1890   Register Src = MI.getOperand(1).getReg();
1891   assert(MRI.getType(Src) == S64);
1892 
1893   // result = trunc(src)
1894   // if (src > 0.0 && src != result)
1895   //   result += 1.0
1896 
1897   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1898 
1899   const auto Zero = B.buildFConstant(S64, 0.0);
1900   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1904   auto Add = B.buildSelect(S64, And, One, Zero);
1905 
1906   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeFrem(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
1914     Register DstReg = MI.getOperand(0).getReg();
1915     Register Src0Reg = MI.getOperand(1).getReg();
1916     Register Src1Reg = MI.getOperand(2).getReg();
1917     auto Flags = MI.getFlags();
1918     LLT Ty = MRI.getType(DstReg);
1919 
1920     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1921     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1922     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1923     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1924     MI.eraseFromParent();
1925     return true;
1926 }
1927 
1928 static MachineInstrBuilder extractF64Exponent(Register Hi,
1929                                               MachineIRBuilder &B) {
1930   const unsigned FractBits = 52;
1931   const unsigned ExpBits = 11;
1932   LLT S32 = LLT::scalar(32);
1933 
1934   auto Const0 = B.buildConstant(S32, FractBits - 32);
1935   auto Const1 = B.buildConstant(S32, ExpBits);
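  // The f64 exponent field occupies bits [62:52], i.e. bits [30:20] of the
  // high word, so ubfe(Hi, 20, 11) extracts the biased exponent and the
  // subtract below removes the bias of 1023.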
1936 
1937   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1938     .addUse(Hi)
1939     .addUse(Const0.getReg(0))
1940     .addUse(Const1.getReg(0));
1941 
1942   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1946   MachineInstr &MI, MachineRegisterInfo &MRI,
1947   MachineIRBuilder &B) const {
1948   const LLT S1 = LLT::scalar(1);
1949   const LLT S32 = LLT::scalar(32);
1950   const LLT S64 = LLT::scalar(64);
1951 
1952   Register Src = MI.getOperand(1).getReg();
1953   assert(MRI.getType(Src) == S64);
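  // The idea: for an unbiased exponent E in [0, 51] the fractional part lives
  // in the low 52 - E bits, so masking those bits off truncates toward zero.
  // E < 0 means |Src| < 1 and the result is just the sign bit (+/-0), while
  // E > 51 means Src is already an integer.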
1954 
1955   // TODO: Should this use extract since the low half is unused?
1956   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1957   Register Hi = Unmerge.getReg(1);
1958 
1959   // Extract the upper half, since this is where we will find the sign and
1960   // exponent.
1961   auto Exp = extractF64Exponent(Hi, B);
1962 
1963   const unsigned FractBits = 52;
1964 
1965   // Extract the sign bit.
1966   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1967   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1968 
1969   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1970 
1971   const auto Zero32 = B.buildConstant(S32, 0);
1972 
1973   // Extend back to 64-bits.
1974   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1975 
1976   auto Shr = B.buildAShr(S64, FractMask, Exp);
1977   auto Not = B.buildNot(S64, Shr);
1978   auto Tmp0 = B.buildAnd(S64, Src, Not);
1979   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1980 
1981   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1982   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1983 
1984   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1985   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1986   MI.eraseFromParent();
1987   return true;
1988 }
1989 
1990 bool AMDGPULegalizerInfo::legalizeITOFP(
1991   MachineInstr &MI, MachineRegisterInfo &MRI,
1992   MachineIRBuilder &B, bool Signed) const {
1993 
1994   Register Dst = MI.getOperand(0).getReg();
1995   Register Src = MI.getOperand(1).getReg();
1996 
1997   const LLT S64 = LLT::scalar(64);
1998   const LLT S32 = LLT::scalar(32);
1999 
2000   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2001 
2002   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2003 
2004   auto CvtHi = Signed ?
2005     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2006     B.buildUITOFP(S64, Unmerge.getReg(1));
2007 
2008   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2009 
2010   auto ThirtyTwo = B.buildConstant(S32, 32);
2011   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2012     .addUse(CvtHi.getReg(0))
2013     .addUse(ThirtyTwo.getReg(0));
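  // Result = ldexp(hi, 32) + lo, i.e. hi * 2^32 + lo converted to f64. The low
  // half is always converted as unsigned.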
2014 
2015   // TODO: Should this propagate fast-math-flags?
2016   B.buildFAdd(Dst, LdExp, CvtLo);
2017   MI.eraseFromParent();
2018   return true;
2019 }
2020 
2021 // TODO: Copied from DAG implementation. Verify logic and document how this
2022 // actually works.
2023 bool AMDGPULegalizerInfo::legalizeFPTOI(
2024   MachineInstr &MI, MachineRegisterInfo &MRI,
2025   MachineIRBuilder &B, bool Signed) const {
2026 
2027   Register Dst = MI.getOperand(0).getReg();
2028   Register Src = MI.getOperand(1).getReg();
2029 
2030   const LLT S64 = LLT::scalar(64);
2031   const LLT S32 = LLT::scalar(32);
2032 
2033   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2034 
2035   unsigned Flags = MI.getFlags();
2036 
2037   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2038   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2039   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
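  // K0 is 2^-32 and K1 is -2^32: FloorMul = floor(Trunc * 2^-32) becomes the
  // high 32 bits of the result, and the FMA computes Trunc - FloorMul * 2^32,
  // i.e. the low 32 bits.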
2040 
2041   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2042   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2043   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2044 
2045   auto Hi = Signed ?
2046     B.buildFPTOSI(S32, FloorMul) :
2047     B.buildFPTOUI(S32, FloorMul);
2048   auto Lo = B.buildFPTOUI(S32, Fma);
2049 
2050   B.buildMerge(Dst, { Lo, Hi });
2051   MI.eraseFromParent();
2052 
2053   return true;
2054 }
2055 
2056 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2057                                                MachineInstr &MI) const {
2058   MachineFunction &MF = Helper.MIRBuilder.getMF();
2059   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2060 
2061   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2062                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2063 
2064   // With ieee_mode disabled, the instructions have the correct behavior
2065   // already for G_FMINNUM/G_FMAXNUM
2066   if (!MFI->getMode().IEEE)
2067     return !IsIEEEOp;
2068 
2069   if (IsIEEEOp)
2070     return true;
2071 
2072   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2073 }
2074 
2075 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2076   MachineInstr &MI, MachineRegisterInfo &MRI,
2077   MachineIRBuilder &B) const {
2078   // TODO: Should move some of this into LegalizerHelper.
2079 
2080   // TODO: Promote dynamic indexing of s16 to s32
2081 
2082   // FIXME: Artifact combiner probably should have replaced the truncated
2083   // constant before this, so we shouldn't need
2084   // getConstantVRegValWithLookThrough.
2085   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2086     MI.getOperand(2).getReg(), MRI);
2087   if (!IdxVal) // Dynamic case will be selected to register indexing.
2088     return true;
2089 
2090   Register Dst = MI.getOperand(0).getReg();
2091   Register Vec = MI.getOperand(1).getReg();
2092 
2093   LLT VecTy = MRI.getType(Vec);
2094   LLT EltTy = VecTy.getElementType();
2095   assert(EltTy == MRI.getType(Dst));
2096 
2097   if (IdxVal->Value < VecTy.getNumElements())
2098     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2099   else
2100     B.buildUndef(Dst);
2101 
2102   MI.eraseFromParent();
2103   return true;
2104 }
2105 
2106 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2107   MachineInstr &MI, MachineRegisterInfo &MRI,
2108   MachineIRBuilder &B) const {
2109   // TODO: Should move some of this into LegalizerHelper.
2110 
2111   // TODO: Promote dynamic indexing of s16 to s32
2112 
2113   // FIXME: Artifact combiner probably should have replaced the truncated
2114   // constant before this, so we shouldn't need
2115   // getConstantVRegValWithLookThrough.
2116   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2117     MI.getOperand(3).getReg(), MRI);
2118   if (!IdxVal) // Dynamic case will be selected to register indexing.
2119     return true;
2120 
2121   Register Dst = MI.getOperand(0).getReg();
2122   Register Vec = MI.getOperand(1).getReg();
2123   Register Ins = MI.getOperand(2).getReg();
2124 
2125   LLT VecTy = MRI.getType(Vec);
2126   LLT EltTy = VecTy.getElementType();
2127   assert(EltTy == MRI.getType(Ins));
2128 
2129   if (IdxVal->Value < VecTy.getNumElements())
2130     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2131   else
2132     B.buildUndef(Dst);
2133 
2134   MI.eraseFromParent();
2135   return true;
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2139   MachineInstr &MI, MachineRegisterInfo &MRI,
2140   MachineIRBuilder &B) const {
2141   const LLT V2S16 = LLT::vector(2, 16);
2142 
2143   Register Dst = MI.getOperand(0).getReg();
2144   Register Src0 = MI.getOperand(1).getReg();
2145   LLT DstTy = MRI.getType(Dst);
2146   LLT SrcTy = MRI.getType(Src0);
2147 
2148   if (SrcTy == V2S16 && DstTy == V2S16 &&
2149       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2150     return true;
2151 
2152   MachineIRBuilder HelperBuilder(MI);
2153   GISelObserverWrapper DummyObserver;
2154   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2155   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::legalizeSinCos(
2159   MachineInstr &MI, MachineRegisterInfo &MRI,
2160   MachineIRBuilder &B) const {
2161 
2162   Register DstReg = MI.getOperand(0).getReg();
2163   Register SrcReg = MI.getOperand(1).getReg();
2164   LLT Ty = MRI.getType(DstReg);
2165   unsigned Flags = MI.getFlags();
2166 
2167   Register TrigVal;
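  // The hardware sin/cos inputs are in units of 2*pi, hence the scale by
  // 1/(2*pi). On subtargets where the instructions only accept a reduced input
  // range, amdgcn.fract performs the range reduction first.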
2168   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2169   if (ST.hasTrigReducedRange()) {
2170     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2171     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2172       .addUse(MulVal.getReg(0))
2173       .setMIFlags(Flags).getReg(0);
2174   } else
2175     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2176 
2177   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2178     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2179   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2180     .addUse(TrigVal)
2181     .setMIFlags(Flags);
2182   MI.eraseFromParent();
2183   return true;
2184 }
2185 
2186 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2187                                                   MachineIRBuilder &B,
2188                                                   const GlobalValue *GV,
2189                                                   int64_t Offset,
2190                                                   unsigned GAFlags) const {
2191   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2192   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2193   // to the following code sequence:
2194   //
2195   // For constant address space:
2196   //   s_getpc_b64 s[0:1]
2197   //   s_add_u32 s0, s0, $symbol
2198   //   s_addc_u32 s1, s1, 0
2199   //
2200   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2201   //   a fixup or relocation is emitted to replace $symbol with a literal
2202   //   constant, which is a pc-relative offset from the encoding of the $symbol
2203   //   operand to the global variable.
2204   //
2205   // For global address space:
2206   //   s_getpc_b64 s[0:1]
2207   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2208   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2209   //
2210   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2211   //   fixups or relocations are emitted to replace $symbol@*@lo and
2212   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2213   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2214   //   operand to the global variable.
2215   //
2216   // What we want here is an offset from the value returned by s_getpc
2217   // (which is the address of the s_add_u32 instruction) to the global
2218   // variable, but since the encoding of $symbol starts 4 bytes after the start
2219   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2220   // small. This requires us to add 4 to the global variable offset in order to
2221   // compute the correct address.
2222 
2223   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2224 
2225   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2226     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2227 
2228   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2229     .addDef(PCReg);
2230 
2231   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2232   if (GAFlags == SIInstrInfo::MO_NONE)
2233     MIB.addImm(0);
2234   else
2235     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2236 
2237   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2238 
2239   if (PtrTy.getSizeInBits() == 32)
2240     B.buildExtract(DstReg, PCReg, 0);
2241   return true;
2242  }
2243 
2244 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2245   MachineInstr &MI, MachineRegisterInfo &MRI,
2246   MachineIRBuilder &B) const {
2247   Register DstReg = MI.getOperand(0).getReg();
2248   LLT Ty = MRI.getType(DstReg);
2249   unsigned AS = Ty.getAddressSpace();
2250 
2251   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2252   MachineFunction &MF = B.getMF();
2253   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2254 
2255   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2256     if (!MFI->isEntryFunction()) {
2257       const Function &Fn = MF.getFunction();
2258       DiagnosticInfoUnsupported BadLDSDecl(
2259         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2260         DS_Warning);
2261       Fn.getContext().diagnose(BadLDSDecl);
2262 
2263       // We currently don't have a way to correctly allocate LDS objects that
2264       // aren't directly associated with a kernel. We do force inlining of
2265       // functions that use local objects. However, if these dead functions are
2266       // not eliminated, we don't want a compile time error. Just emit a warning
2267       // and a trap, since there should be no callable path here.
2268       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2269       B.buildUndef(DstReg);
2270       MI.eraseFromParent();
2271       return true;
2272     }
2273 
2274     // TODO: We could emit code to handle the initialization somewhere.
2275     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2276       const SITargetLowering *TLI = ST.getTargetLowering();
2277       if (!TLI->shouldUseLDSConstAddress(GV)) {
2278         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2279         return true; // Leave in place;
2280       }
2281 
2282       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2283         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared memory
        // whose size is not known at compile time. It is allocated by the
        // runtime and placed directly after the statically allocated ones.
        // All such variables share the same offset.
2289         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2290           // Adjust alignment for that dynamic shared memory array.
2291           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2292           LLT S32 = LLT::scalar(32);
2293           auto Sz =
2294               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2295           B.buildIntToPtr(DstReg, Sz);
2296           MI.eraseFromParent();
2297           return true;
2298         }
2299       }
2300 
2301       B.buildConstant(
2302           DstReg,
2303           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2304       MI.eraseFromParent();
2305       return true;
2306     }
2307 
2308     const Function &Fn = MF.getFunction();
2309     DiagnosticInfoUnsupported BadInit(
2310       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2311     Fn.getContext().diagnose(BadInit);
2312     return true;
2313   }
2314 
2315   const SITargetLowering *TLI = ST.getTargetLowering();
2316 
2317   if (TLI->shouldEmitFixup(GV)) {
2318     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2319     MI.eraseFromParent();
2320     return true;
2321   }
2322 
2323   if (TLI->shouldEmitPCReloc(GV)) {
2324     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2325     MI.eraseFromParent();
2326     return true;
2327   }
2328 
2329   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2330   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2331 
2332   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2333       MachinePointerInfo::getGOT(MF),
2334       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2335           MachineMemOperand::MOInvariant,
2336       8 /*Size*/, Align(8));
2337 
2338   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2339 
2340   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2342     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2343     B.buildExtract(DstReg, Load, 0);
2344   } else
2345     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2346 
2347   MI.eraseFromParent();
2348   return true;
2349 }
2350 
2351 static LLT widenToNextPowerOf2(LLT Ty) {
2352   if (Ty.isVector())
2353     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2354   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2355 }
2356 
2357 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2358                                        MachineInstr &MI) const {
2359   MachineIRBuilder &B = Helper.MIRBuilder;
2360   MachineRegisterInfo &MRI = *B.getMRI();
2361   GISelChangeObserver &Observer = Helper.Observer;
2362 
2363   Register PtrReg = MI.getOperand(1).getReg();
2364   LLT PtrTy = MRI.getType(PtrReg);
2365   unsigned AddrSpace = PtrTy.getAddressSpace();
2366 
2367   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2368     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2369     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2370     Observer.changingInstr(MI);
2371     MI.getOperand(1).setReg(Cast.getReg(0));
2372     Observer.changedInstr(MI);
2373     return true;
2374   }
2375 
2376   Register ValReg = MI.getOperand(0).getReg();
2377   LLT ValTy = MRI.getType(ValReg);
2378 
2379   MachineMemOperand *MMO = *MI.memoperands_begin();
2380   const unsigned ValSize = ValTy.getSizeInBits();
2381   const unsigned MemSize = 8 * MMO->getSize();
2382   const Align MemAlign = MMO->getAlign();
2383   const unsigned AlignInBits = 8 * MemAlign.value();
2384 
2385   // Widen non-power-of-2 loads to the alignment if needed
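  // (E.g. an s96 load may be widened to an s128 load whose extra bits are
  // dropped afterwards.)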
2386   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2387     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2388 
2389     // This was already the correct extending load result type, so just adjust
2390     // the memory type.
2391     if (WideMemSize == ValSize) {
2392       MachineFunction &MF = B.getMF();
2393 
2394       MachineMemOperand *WideMMO =
2395           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2396       Observer.changingInstr(MI);
2397       MI.setMemRefs(MF, {WideMMO});
2398       Observer.changedInstr(MI);
2399       return true;
2400     }
2401 
2402     // Don't bother handling edge case that should probably never be produced.
2403     if (ValSize > WideMemSize)
2404       return false;
2405 
2406     LLT WideTy = widenToNextPowerOf2(ValTy);
2407 
2408     Register WideLoad;
2409     if (!WideTy.isVector()) {
2410       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2411       B.buildTrunc(ValReg, WideLoad).getReg(0);
2412     } else {
2413       // Extract the subvector.
2414 
2415       if (isRegisterType(ValTy)) {
2416         // If this a case where G_EXTRACT is legal, use it.
2417         // (e.g. <3 x s32> -> <4 x s32>)
2418         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2419         B.buildExtract(ValReg, WideLoad, 0);
2420       } else {
2421         // For cases where the widened type isn't a nice register value, unmerge
2422         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2423         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2424         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2425         B.setInsertPt(B.getMBB(), MI.getIterator());
2426         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2427       }
2428     }
2429 
2430     MI.eraseFromParent();
2431     return true;
2432   }
2433 
2434   return false;
2435 }
2436 
2437 bool AMDGPULegalizerInfo::legalizeFMad(
2438   MachineInstr &MI, MachineRegisterInfo &MRI,
2439   MachineIRBuilder &B) const {
2440   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2441   assert(Ty.isScalar());
2442 
2443   MachineFunction &MF = B.getMF();
2444   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2445 
2446   // TODO: Always legal with future ftz flag.
2447   // FIXME: Do we need just output?
2448   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2449     return true;
2450   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2451     return true;
2452 
2453   MachineIRBuilder HelperBuilder(MI);
2454   GISelObserverWrapper DummyObserver;
2455   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2456   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2457 }
2458 
2459 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2460   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2461   Register DstReg = MI.getOperand(0).getReg();
2462   Register PtrReg = MI.getOperand(1).getReg();
2463   Register CmpVal = MI.getOperand(2).getReg();
2464   Register NewVal = MI.getOperand(3).getReg();
2465 
2466   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2467          "this should not have been custom lowered");
2468 
2469   LLT ValTy = MRI.getType(CmpVal);
2470   LLT VecTy = LLT::vector(2, ValTy);
2471 
2472   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2473 
2474   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2475     .addDef(DstReg)
2476     .addUse(PtrReg)
2477     .addUse(PackedVal)
2478     .setMemRefs(MI.memoperands());
2479 
2480   MI.eraseFromParent();
2481   return true;
2482 }
2483 
2484 bool AMDGPULegalizerInfo::legalizeFlog(
2485   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2486   Register Dst = MI.getOperand(0).getReg();
2487   Register Src = MI.getOperand(1).getReg();
2488   LLT Ty = B.getMRI()->getType(Dst);
2489   unsigned Flags = MI.getFlags();
2490 
2491   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2492   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2493 
2494   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2495   MI.eraseFromParent();
2496   return true;
2497 }
2498 
2499 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2500                                        MachineIRBuilder &B) const {
2501   Register Dst = MI.getOperand(0).getReg();
2502   Register Src = MI.getOperand(1).getReg();
2503   unsigned Flags = MI.getFlags();
2504   LLT Ty = B.getMRI()->getType(Dst);
2505 
2506   auto K = B.buildFConstant(Ty, numbers::log2e);
2507   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2508   B.buildFExp2(Dst, Mul, Flags);
2509   MI.eraseFromParent();
2510   return true;
2511 }
2512 
2513 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2514                                        MachineIRBuilder &B) const {
2515   Register Dst = MI.getOperand(0).getReg();
2516   Register Src0 = MI.getOperand(1).getReg();
2517   Register Src1 = MI.getOperand(2).getReg();
2518   unsigned Flags = MI.getFlags();
2519   LLT Ty = B.getMRI()->getType(Dst);
2520   const LLT S16 = LLT::scalar(16);
2521   const LLT S32 = LLT::scalar(32);
2522 
2523   if (Ty == S32) {
2524     auto Log = B.buildFLog2(S32, Src0, Flags);
2525     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2526       .addUse(Log.getReg(0))
2527       .addUse(Src1)
2528       .setMIFlags(Flags);
2529     B.buildFExp2(Dst, Mul, Flags);
2530   } else if (Ty == S16) {
2531     // There's no f16 fmul_legacy, so we need to convert for it.
2532     auto Log = B.buildFLog2(S16, Src0, Flags);
2533     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2534     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2535     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2536       .addUse(Ext0.getReg(0))
2537       .addUse(Ext1.getReg(0))
2538       .setMIFlags(Flags);
2539 
2540     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2541   } else
2542     return false;
2543 
2544   MI.eraseFromParent();
2545   return true;
2546 }
2547 
2548 // Find a source register, ignoring any possible source modifiers.
2549 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2550   Register ModSrc = OrigSrc;
2551   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2552     ModSrc = SrcFNeg->getOperand(1).getReg();
2553     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2554       ModSrc = SrcFAbs->getOperand(1).getReg();
2555   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2556     ModSrc = SrcFAbs->getOperand(1).getReg();
2557   return ModSrc;
2558 }
2559 
2560 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2561                                          MachineRegisterInfo &MRI,
2562                                          MachineIRBuilder &B) const {
2563 
2564   const LLT S1 = LLT::scalar(1);
2565   const LLT S64 = LLT::scalar(64);
2566   Register Dst = MI.getOperand(0).getReg();
2567   Register OrigSrc = MI.getOperand(1).getReg();
2568   unsigned Flags = MI.getFlags();
2569   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2570          "this should not have been custom lowered");
2571 
2572   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2573   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2574   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2575   // V_FRACT bug is:
2576   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2577   //
2578   // Convert floor(x) to (x - fract(x))
2579 
2580   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2581     .addUse(OrigSrc)
2582     .setMIFlags(Flags);
2583 
2584   // Give source modifier matching some assistance before obscuring a foldable
2585   // pattern.
2586 
2587   // TODO: We can avoid the neg on the fract? The input sign to fract
2588   // shouldn't matter?
2589   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2590 
2591   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2592 
2593   Register Min = MRI.createGenericVirtualRegister(S64);
2594 
2595   // We don't need to concern ourselves with the snan handling difference, so
2596   // use the one which will directly select.
2597   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2598   if (MFI->getMode().IEEE)
2599     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2600   else
2601     B.buildFMinNum(Min, Fract, Const, Flags);
2602 
2603   Register CorrectedFract = Min;
2604   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2606     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2607   }
2608 
2609   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2610   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2611 
2612   MI.eraseFromParent();
2613   return true;
2614 }
2615 
2616 // Turn an illegal packed v2s16 build vector into bit operations.
2617 // TODO: This should probably be a bitcast action in LegalizerHelper.
2618 bool AMDGPULegalizerInfo::legalizeBuildVector(
2619   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2620   Register Dst = MI.getOperand(0).getReg();
2621   const LLT S32 = LLT::scalar(32);
2622   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2623 
2624   Register Src0 = MI.getOperand(1).getReg();
2625   Register Src1 = MI.getOperand(2).getReg();
2626   assert(MRI.getType(Src0) == LLT::scalar(16));
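  // Pack the two s16 sources into one s32 (Src0 in the low 16 bits) and
  // bitcast the result to <2 x s16>.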
2627 
2628   auto Merge = B.buildMerge(S32, {Src0, Src1});
2629   B.buildBitcast(Dst, Merge);
2630 
2631   MI.eraseFromParent();
2632   return true;
2633 }
2634 
2635 // Return the use branch instruction, otherwise null if the usage is invalid.
2636 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2637                                        MachineRegisterInfo &MRI,
2638                                        MachineInstr *&Br,
2639                                        MachineBasicBlock *&UncondBrTarget) {
2640   Register CondDef = MI.getOperand(0).getReg();
2641   if (!MRI.hasOneNonDBGUse(CondDef))
2642     return nullptr;
2643 
2644   MachineBasicBlock *Parent = MI.getParent();
2645   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2646   if (UseMI.getParent() != Parent ||
2647       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2648     return nullptr;
2649 
2650   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2651   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2652   if (Next == Parent->end()) {
2653     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2654     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2655       return nullptr;
2656     UncondBrTarget = &*NextMBB;
2657   } else {
2658     if (Next->getOpcode() != AMDGPU::G_BR)
2659       return nullptr;
2660     Br = &*Next;
2661     UncondBrTarget = Br->getOperand(0).getMBB();
2662   }
2663 
2664   return &UseMI;
2665 }
2666 
2667 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2668                                          const ArgDescriptor *Arg,
2669                                          const TargetRegisterClass *ArgRC,
2670                                          LLT ArgTy) const {
2671   MCRegister SrcReg = Arg->getRegister();
2672   assert(SrcReg.isPhysical() && "Physical register expected");
2673   assert(DstReg.isVirtual() && "Virtual register expected");
2674 
2675   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2676                                              ArgTy);
2677   if (Arg->isMasked()) {
2678     // TODO: Should we try to emit this once in the entry block?
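    // E.g. when several IDs are packed into a single input register, Mask
    // selects the relevant bit-field and Shift moves it down to bit 0.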
2679     const LLT S32 = LLT::scalar(32);
2680     const unsigned Mask = Arg->getMask();
2681     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2682 
2683     Register AndMaskSrc = LiveIn;
2684 
2685     if (Shift != 0) {
2686       auto ShiftAmt = B.buildConstant(S32, Shift);
2687       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2688     }
2689 
2690     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2691   } else {
2692     B.buildCopy(DstReg, LiveIn);
2693   }
2694 
2695   return true;
2696 }
2697 
2698 bool AMDGPULegalizerInfo::loadInputValue(
2699     Register DstReg, MachineIRBuilder &B,
2700     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2701   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2702   const ArgDescriptor *Arg;
2703   const TargetRegisterClass *ArgRC;
2704   LLT ArgTy;
2705   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2706 
2707   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2708     return false; // TODO: Handle these
2709   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2710 }
2711 
2712 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2713     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2714     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2715   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2716     return false;
2717 
2718   MI.eraseFromParent();
2719   return true;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2723                                        MachineRegisterInfo &MRI,
2724                                        MachineIRBuilder &B) const {
2725   Register Dst = MI.getOperand(0).getReg();
2726   LLT DstTy = MRI.getType(Dst);
2727   LLT S16 = LLT::scalar(16);
2728   LLT S32 = LLT::scalar(32);
2729   LLT S64 = LLT::scalar(64);
2730 
2731   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2732     return true;
2733 
2734   if (DstTy == S16)
2735     return legalizeFDIV16(MI, MRI, B);
2736   if (DstTy == S32)
2737     return legalizeFDIV32(MI, MRI, B);
2738   if (DstTy == S64)
2739     return legalizeFDIV64(MI, MRI, B);
2740 
2741   return false;
2742 }
2743 
2744 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2745                                                   Register DstReg,
2746                                                   Register X,
2747                                                   Register Y,
2748                                                   bool IsDiv) const {
2749   const LLT S1 = LLT::scalar(1);
2750   const LLT S32 = LLT::scalar(32);
2751 
2752   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2753   // algorithm used here.
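  // Roughly: Z ~= 2^32 / Y from the float reciprocal; one Newton-Raphson step
  // refines it, after which Q = umulh(X, Z) underestimates the quotient by at
  // most 2, so two conditional corrections give the exact quotient/remainder.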
2754 
2755   // Initial estimate of inv(y).
2756   auto FloatY = B.buildUITOFP(S32, Y);
2757   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2758   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2759   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2760   auto Z = B.buildFPTOUI(S32, ScaledY);
2761 
2762   // One round of UNR.
2763   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2764   auto NegYZ = B.buildMul(S32, NegY, Z);
2765   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2766 
2767   // Quotient/remainder estimate.
2768   auto Q = B.buildUMulH(S32, X, Z);
2769   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2770 
2771   // First quotient/remainder refinement.
2772   auto One = B.buildConstant(S32, 1);
2773   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2774   if (IsDiv)
2775     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2776   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2777 
2778   // Second quotient/remainder refinement.
2779   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2780   if (IsDiv)
2781     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2782   else
2783     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2784 }
2785 
2786 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2787                                               MachineRegisterInfo &MRI,
2788                                               MachineIRBuilder &B) const {
2789   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2790   Register DstReg = MI.getOperand(0).getReg();
2791   Register Num = MI.getOperand(1).getReg();
2792   Register Den = MI.getOperand(2).getReg();
2793   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2794   MI.eraseFromParent();
2795   return true;
2796 }
2797 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2799 //
2800 // Return lo, hi of result
2801 //
2802 // %cvt.lo = G_UITOFP Val.lo
2803 // %cvt.hi = G_UITOFP Val.hi
2804 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2805 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2806 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2807 // %mul2 = G_FMUL %mul1, 2**(-32)
2808 // %trunc = G_INTRINSIC_TRUNC %mul2
2809 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2810 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
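//
// The constant 0x5f7ffffc used for %mul1 is just below 2**64 as a float, so
// %mul1 approximates 2**64 / Val.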
2811 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2812                                                        Register Val) {
2813   const LLT S32 = LLT::scalar(32);
2814   auto Unmerge = B.buildUnmerge(S32, Val);
2815 
2816   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2817   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2818 
2819   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2820                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2821 
2822   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2823   auto Mul1 =
2824       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2825 
2826   // 2**(-32)
2827   auto Mul2 =
2828       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2829   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2830 
2831   // -(2**32)
2832   auto Mad2 = B.buildFMAD(S32, Trunc,
2833                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2834 
2835   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2836   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2837 
2838   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2839 }
2840 
2841 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2842                                                   Register DstReg,
2843                                                   Register Numer,
2844                                                   Register Denom,
2845                                                   bool IsDiv) const {
2846   const LLT S32 = LLT::scalar(32);
2847   const LLT S64 = LLT::scalar(64);
2848   const LLT S1 = LLT::scalar(1);
2849   Register RcpLo, RcpHi;
2850 
2851   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2852 
2853   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
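  // Rcp approximates 2**64 / Denom. The 32-bit pieces below perform two
  // Newton-Raphson refinement steps, then the quotient estimate
  // umulh(Numer, Rcp) is corrected at most twice, as in the 32-bit expansion.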
2854 
2855   auto Zero64 = B.buildConstant(S64, 0);
2856   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2857 
2858   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2859   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2860 
2861   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2862   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2863   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2864 
2865   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2866   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2867   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2868   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2869 
2870   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2871   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2872   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2873   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2874   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2875 
2876   auto Zero32 = B.buildConstant(S32, 0);
2877   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2878   auto Add2_HiC =
2879       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2880   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2881   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
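  // Add2 holds the reciprocal estimate after the second refinement round.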
2882 
2883   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2884   Register NumerLo = UnmergeNumer.getReg(0);
2885   Register NumerHi = UnmergeNumer.getReg(1);
2886 
2887   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2888   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2889   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2890   Register Mul3_Lo = UnmergeMul3.getReg(0);
2891   Register Mul3_Hi = UnmergeMul3.getReg(1);
2892   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2893   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2894   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2895   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
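  // MulHi3 is the quotient estimate; Sub1 = Numer - Denom * MulHi3 is the
  // corresponding remainder candidate.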
2896 
2897   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2898   Register DenomLo = UnmergeDenom.getReg(0);
2899   Register DenomHi = UnmergeDenom.getReg(1);
2900 
2901   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2902   auto C1 = B.buildSExt(S32, CmpHi);
2903 
2904   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2905   auto C2 = B.buildSExt(S32, CmpLo);
2906 
2907   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2908   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
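  // C3 is non-zero iff the remainder candidate Sub1 is >= Denom, i.e. the
  // quotient estimate is too small by at least one.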
2909 
2910   // TODO: Here and below, portions of the code could be enclosed in if/endif.
2911   // Currently the control flow is unconditional and we have 4 selects after
2912   // the potential endif to substitute PHIs.
2913 
2914   // if C3 != 0 ...
2915   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2916   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2917   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2918   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2919 
2920   auto One64 = B.buildConstant(S64, 1);
2921   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2922 
2923   auto C4 =
2924       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2925   auto C5 =
2926       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2927   auto C6 = B.buildSelect(
2928       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2929 
2930   // if (C6 != 0)
2931   auto Add4 = B.buildAdd(S64, Add3, One64);
2932   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2933 
2934   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2935   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2936   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2937 
2938   // endif C6
2939   // endif C3
2940 
2941   if (IsDiv) {
2942     auto Sel1 = B.buildSelect(
2943         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2944     B.buildSelect(DstReg,
2945                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2946   } else {
2947     auto Sel2 = B.buildSelect(
2948         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2949     B.buildSelect(DstReg,
2950                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2951   }
2952 }
2953 
2954 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2955                                             MachineRegisterInfo &MRI,
2956                                             MachineIRBuilder &B) const {
2957   const LLT S64 = LLT::scalar(64);
2958   const LLT S32 = LLT::scalar(32);
2959   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2960   Register DstReg = MI.getOperand(0).getReg();
2961   Register Num = MI.getOperand(1).getReg();
2962   Register Den = MI.getOperand(2).getReg();
2963   LLT Ty = MRI.getType(DstReg);
2964 
2965   if (Ty == S32)
2966     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2967   else if (Ty == S64)
2968     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2969   else
2970     return false;
2971 
2972   MI.eraseFromParent();
2973   return true;
2975 }
2976 
2977 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2978                                             MachineRegisterInfo &MRI,
2979                                             MachineIRBuilder &B) const {
2980   const LLT S64 = LLT::scalar(64);
2981   const LLT S32 = LLT::scalar(32);
2982 
2983   Register DstReg = MI.getOperand(0).getReg();
2984   const LLT Ty = MRI.getType(DstReg);
2985   if (Ty != S32 && Ty != S64)
2986     return false;
2987 
2988   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2989 
2990   Register LHS = MI.getOperand(1).getReg();
2991   Register RHS = MI.getOperand(2).getReg();
2992 
2993   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2994   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2995   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2996 
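  // Take the absolute values: with the sign splat being 0 or -1,
  // (x + sign) ^ sign == |x| in two's complement.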
2997   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2998   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2999 
3000   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3001   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3002 
3003   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3004   if (Ty == S32)
3005     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3006   else
3007     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3008 
3009   Register Sign;
3010   if (IsDiv)
3011     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3012   else
3013     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3014 
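  // Apply the result sign: (r ^ Sign) - Sign negates r when Sign is -1 and is a
  // no-op when Sign is 0.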
3015   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3016   B.buildSub(DstReg, UDivRem, Sign);
3017 
3018   MI.eraseFromParent();
3019   return true;
3020 }
3021 
3022 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3023                                                  MachineRegisterInfo &MRI,
3024                                                  MachineIRBuilder &B) const {
3025   Register Res = MI.getOperand(0).getReg();
3026   Register LHS = MI.getOperand(1).getReg();
3027   Register RHS = MI.getOperand(2).getReg();
3028 
3029   uint16_t Flags = MI.getFlags();
3030 
3031   LLT ResTy = MRI.getType(Res);
3032   LLT S32 = LLT::scalar(32);
3033   LLT S64 = LLT::scalar(64);
3034 
3035   const MachineFunction &MF = B.getMF();
3036   bool Unsafe =
3037     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
3038 
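  // Bail out (to the full expansion) when the fast lowering is not allowed: f64
  // requires unsafe-fp-math, and f32 with denormals enabled requires
  // unsafe-fp-math or the arcp flag.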
3039   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
3040     return false;
3041 
3042   if (!Unsafe && ResTy == S32 &&
3043       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
3044     return false;
3045 
3046   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3047     // 1 / x -> RCP(x)
3048     if (CLHS->isExactlyValue(1.0)) {
3049       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3050         .addUse(RHS)
3051         .setMIFlags(Flags);
3052 
3053       MI.eraseFromParent();
3054       return true;
3055     }
3056 
3057     // -1 / x -> RCP( FNEG(x) )
3058     if (CLHS->isExactlyValue(-1.0)) {
3059       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3060       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3061         .addUse(FNeg.getReg(0))
3062         .setMIFlags(Flags);
3063 
3064       MI.eraseFromParent();
3065       return true;
3066     }
3067   }
3068 
3069   // x / y -> x * (1.0 / y)
3070   if (Unsafe) {
3071     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3072       .addUse(RHS)
3073       .setMIFlags(Flags);
3074     B.buildFMul(Res, LHS, RCP, Flags);
3075 
3076     MI.eraseFromParent();
3077     return true;
3078   }
3079 
3080   return false;
3081 }
3082 
3083 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3084                                          MachineRegisterInfo &MRI,
3085                                          MachineIRBuilder &B) const {
3086   Register Res = MI.getOperand(0).getReg();
3087   Register LHS = MI.getOperand(1).getReg();
3088   Register RHS = MI.getOperand(2).getReg();
3089 
3090   uint16_t Flags = MI.getFlags();
3091 
3092   LLT S16 = LLT::scalar(16);
3093   LLT S32 = LLT::scalar(32);
3094 
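  // Expand f16 fdiv by promoting to f32, multiplying by an f32 reciprocal,
  // truncating the quotient back to f16, and feeding it to amdgcn_div_fixup for
  // the final result.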
3095   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3096   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3097 
3098   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3099     .addUse(RHSExt.getReg(0))
3100     .setMIFlags(Flags);
3101 
3102   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3103   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3104 
3105   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3106     .addUse(RDst.getReg(0))
3107     .addUse(RHS)
3108     .addUse(LHS)
3109     .setMIFlags(Flags);
3110 
3111   MI.eraseFromParent();
3112   return true;
3113 }
3114 
3115 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3116 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3117 static void toggleSPDenormMode(bool Enable,
3118                                MachineIRBuilder &B,
3119                                const GCNSubtarget &ST,
3120                                AMDGPU::SIModeRegisterDefaults Mode) {
3121   // Set SP denorm mode to this value.
3122   unsigned SPDenormMode =
3123     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3124 
3125   if (ST.hasDenormModeInst()) {
3126     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3127     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3128 
3129     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3130     B.buildInstr(AMDGPU::S_DENORM_MODE)
3131       .addImm(NewDenormModeValue);
3132 
3133   } else {
3134     // Select FP32 bit field in mode register.
3135     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3136                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3137                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
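    // i.e. select offset 4, width 2: the two FP32 denorm-mode bits of the MODE
    // register.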
3138 
3139     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3140       .addImm(SPDenormMode)
3141       .addImm(SPDenormModeBitField);
3142   }
3143 }
3144 
3145 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3146                                          MachineRegisterInfo &MRI,
3147                                          MachineIRBuilder &B) const {
3148   Register Res = MI.getOperand(0).getReg();
3149   Register LHS = MI.getOperand(1).getReg();
3150   Register RHS = MI.getOperand(2).getReg();
3151   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3152   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3153 
3154   uint16_t Flags = MI.getFlags();
3155 
3156   LLT S32 = LLT::scalar(32);
3157   LLT S1 = LLT::scalar(1);
3158 
3159   auto One = B.buildFConstant(S32, 1.0f);
3160 
3161   auto DenominatorScaled =
3162     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3163       .addUse(LHS)
3164       .addUse(RHS)
3165       .addImm(0)
3166       .setMIFlags(Flags);
3167   auto NumeratorScaled =
3168     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3169       .addUse(LHS)
3170       .addUse(RHS)
3171       .addImm(1)
3172       .setMIFlags(Flags);
3173 
3174   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3175     .addUse(DenominatorScaled.getReg(0))
3176     .setMIFlags(Flags);
3177   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3178 
3179   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3180   // aren't modeled as reading it.
3181   if (!Mode.allFP32Denormals())
3182     toggleSPDenormMode(true, B, ST, Mode);
3183 
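  // One Newton-Raphson step on the reciprocal (Fma0/Fma1), a quotient estimate
  // (Mul), and two residual/refinement steps (Fma2-Fma4) consumed by div_fmas.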
3184   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3185   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3186   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3187   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3188   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3189   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3190 
3191   if (!Mode.allFP32Denormals())
3192     toggleSPDenormMode(false, B, ST, Mode);
3193 
3194   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3195     .addUse(Fma4.getReg(0))
3196     .addUse(Fma1.getReg(0))
3197     .addUse(Fma3.getReg(0))
3198     .addUse(NumeratorScaled.getReg(1))
3199     .setMIFlags(Flags);
3200 
3201   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3202     .addUse(Fmas.getReg(0))
3203     .addUse(RHS)
3204     .addUse(LHS)
3205     .setMIFlags(Flags);
3206 
3207   MI.eraseFromParent();
3208   return true;
3209 }
3210 
3211 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3212                                          MachineRegisterInfo &MRI,
3213                                          MachineIRBuilder &B) const {
3214   Register Res = MI.getOperand(0).getReg();
3215   Register LHS = MI.getOperand(1).getReg();
3216   Register RHS = MI.getOperand(2).getReg();
3217 
3218   uint16_t Flags = MI.getFlags();
3219 
3220   LLT S64 = LLT::scalar(64);
3221   LLT S1 = LLT::scalar(1);
3222 
3223   auto One = B.buildFConstant(S64, 1.0);
3224 
3225   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3226     .addUse(LHS)
3227     .addUse(RHS)
3228     .addImm(0)
3229     .setMIFlags(Flags);
3230 
3231   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3232 
3233   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3234     .addUse(DivScale0.getReg(0))
3235     .setMIFlags(Flags);
3236 
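  // Two refinement iterations on the reciprocal (Fma0-Fma3), then a quotient
  // estimate (Mul) and a final residual (Fma4) for div_fmas.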
3237   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3238   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3239   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3240 
3241   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3242     .addUse(LHS)
3243     .addUse(RHS)
3244     .addImm(1)
3245     .setMIFlags(Flags);
3246 
3247   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3248   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3249   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3250 
3251   Register Scale;
3252   if (!ST.hasUsableDivScaleConditionOutput()) {
3253     // Workaround a hardware bug on SI where the condition output from div_scale
3254     // is not usable.
3255 
3256     LLT S32 = LLT::scalar(32);
3257 
3258     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3259     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3260     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3261     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3262 
3263     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3264                               Scale1Unmerge.getReg(1));
3265     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3266                               Scale0Unmerge.getReg(1));
3267     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3268   } else {
3269     Scale = DivScale1.getReg(1);
3270   }
3271 
3272   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3273     .addUse(Fma4.getReg(0))
3274     .addUse(Fma3.getReg(0))
3275     .addUse(Mul.getReg(0))
3276     .addUse(Scale)
3277     .setMIFlags(Flags);
3278 
3279   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3280     .addUse(Fmas.getReg(0))
3281     .addUse(RHS)
3282     .addUse(LHS)
3283     .setMIFlags(Flags);
3284 
3285   MI.eraseFromParent();
3286   return true;
3287 }
3288 
3289 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3290                                                  MachineRegisterInfo &MRI,
3291                                                  MachineIRBuilder &B) const {
3292   Register Res = MI.getOperand(0).getReg();
3293   Register LHS = MI.getOperand(2).getReg();
3294   Register RHS = MI.getOperand(3).getReg();
3295   uint16_t Flags = MI.getFlags();
3296 
3297   LLT S32 = LLT::scalar(32);
3298   LLT S1 = LLT::scalar(1);
3299 
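  // If |RHS| is huge (> 0x6f800000 == 2^96), pre-scale it by 2^-32 so the
  // reciprocal of the scaled value stays away from the denormal/underflow range,
  // and multiply the final product by the same scale factor to compensate.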
3300   auto Abs = B.buildFAbs(S32, RHS, Flags);
3301   const APFloat C0Val(1.0f);
3302 
3303   auto C0 = B.buildConstant(S32, 0x6f800000);
3304   auto C1 = B.buildConstant(S32, 0x2f800000);
3305   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3306 
3307   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3308   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3309 
3310   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3311 
3312   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3313     .addUse(Mul0.getReg(0))
3314     .setMIFlags(Flags);
3315 
3316   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3317 
3318   B.buildFMul(Res, Sel, Mul1, Flags);
3319 
3320   MI.eraseFromParent();
3321   return true;
3322 }
3323 
3324 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3325 // FIXME: Why do we handle this one but not other removed instructions?
3326 //
3327 // Reciprocal square root.  The clamp prevents infinite results, clamping
3328 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3329 // +-max_float.
3330 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3331                                                     MachineRegisterInfo &MRI,
3332                                                     MachineIRBuilder &B) const {
3333   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3334     return true;
3335 
3336   Register Dst = MI.getOperand(0).getReg();
3337   Register Src = MI.getOperand(2).getReg();
3338   auto Flags = MI.getFlags();
3339 
3340   LLT Ty = MRI.getType(Dst);
3341 
3342   const fltSemantics *FltSemantics;
3343   if (Ty == LLT::scalar(32))
3344     FltSemantics = &APFloat::IEEEsingle();
3345   else if (Ty == LLT::scalar(64))
3346     FltSemantics = &APFloat::IEEEdouble();
3347   else
3348     return false;
3349 
3350   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3351     .addUse(Src)
3352     .setMIFlags(Flags);
3353 
3354   // We don't need to concern ourselves with the snan handling difference, since
3355   // the rsq result is already quieted (or not); use the one that will directly select.
3356   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3357   const bool UseIEEE = MFI->getMode().IEEE;
3358 
3359   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3360   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3361                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3362 
3363   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3364 
3365   if (UseIEEE)
3366     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3367   else
3368     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3369   MI.eraseFromParent();
3370   return true;
3371 }
3372 
3373 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3374   switch (IID) {
3375   case Intrinsic::amdgcn_ds_fadd:
3376     return AMDGPU::G_ATOMICRMW_FADD;
3377   case Intrinsic::amdgcn_ds_fmin:
3378     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3379   case Intrinsic::amdgcn_ds_fmax:
3380     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3381   default:
3382     llvm_unreachable("not a DS FP intrinsic");
3383   }
3384 }
3385 
3386 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3387                                                       MachineInstr &MI,
3388                                                       Intrinsic::ID IID) const {
3389   GISelChangeObserver &Observer = Helper.Observer;
3390   Observer.changingInstr(MI);
3391 
3392   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3393 
3394   // The remaining operands were used to set fields in the MemOperand on
3395   // construction.
3396   for (int I = 6; I > 3; --I)
3397     MI.RemoveOperand(I);
3398 
3399   MI.RemoveOperand(1); // Remove the intrinsic ID.
3400   Observer.changedInstr(MI);
3401   return true;
3402 }
3403 
3404 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3405                                             MachineRegisterInfo &MRI,
3406                                             MachineIRBuilder &B) const {
3407   uint64_t Offset =
3408     ST.getTargetLowering()->getImplicitParameterOffset(
3409       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3410   LLT DstTy = MRI.getType(DstReg);
3411   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3412 
3413   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3414   if (!loadInputValue(KernargPtrReg, B,
3415                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3416     return false;
3417 
3418   // FIXME: This should be nuw
3419   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3420   return true;
3421 }
3422 
3423 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3424                                                  MachineRegisterInfo &MRI,
3425                                                  MachineIRBuilder &B) const {
3426   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3427   if (!MFI->isEntryFunction()) {
3428     return legalizePreloadedArgIntrin(MI, MRI, B,
3429                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3430   }
3431 
3432   Register DstReg = MI.getOperand(0).getReg();
3433   if (!getImplicitArgPtr(DstReg, MRI, B))
3434     return false;
3435 
3436   MI.eraseFromParent();
3437   return true;
3438 }
3439 
3440 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3441                                               MachineRegisterInfo &MRI,
3442                                               MachineIRBuilder &B,
3443                                               unsigned AddrSpace) const {
3444   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3445   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3446   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3447   MI.eraseFromParent();
3448   return true;
3449 }
3450 
3451 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3452 // offset (the offset that is included in bounds checking and swizzling, to be
3453 // split between the instruction's voffset and immoffset fields) and soffset
3454 // (the offset that is excluded from bounds checking and swizzling, to go in
3455 // the instruction's soffset field).  This function takes the first kind of
3456 // offset and figures out how to split it between voffset and immoffset.
3457 std::tuple<Register, unsigned, unsigned>
3458 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3459                                         Register OrigOffset) const {
3460   const unsigned MaxImm = 4095;
3461   Register BaseReg;
3462   unsigned TotalConstOffset;
3463   MachineInstr *OffsetDef;
3464   const LLT S32 = LLT::scalar(32);
3465 
3466   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3467     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3468 
3469   unsigned ImmOffset = TotalConstOffset;
3470 
3471   // If the immediate value is too big for the immoffset field, put the value
3472   // and -4096 into the immoffset field so that the value that is copied/added
3473   // for the voffset field is a multiple of 4096, and it stands more chance
3474   // of being CSEd with the copy/add for another similar load/store.
3475   // However, do not do that rounding down to a multiple of 4096 if that is a
3476   // negative number, as it appears to be illegal to have a negative offset
3477   // in the vgpr, even if adding the immediate offset makes it positive.
3478   unsigned Overflow = ImmOffset & ~MaxImm;
3479   ImmOffset -= Overflow;
3480   if ((int32_t)Overflow < 0) {
3481     Overflow += ImmOffset;
3482     ImmOffset = 0;
3483   }
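  // For example, a constant offset of 4100 is split into Overflow = 4096 (folded
  // into the base register below) and ImmOffset = 4.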
3484 
3485   if (Overflow != 0) {
3486     if (!BaseReg) {
3487       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3488     } else {
3489       auto OverflowVal = B.buildConstant(S32, Overflow);
3490       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3491     }
3492   }
3493 
3494   if (!BaseReg)
3495     BaseReg = B.buildConstant(S32, 0).getReg(0);
3496 
3497   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3498 }
3499 
3500 /// Handle register layout difference for f16 images for some subtargets.
3501 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3502                                              MachineRegisterInfo &MRI,
3503                                              Register Reg) const {
3504   if (!ST.hasUnpackedD16VMem())
3505     return Reg;
3506 
3507   const LLT S16 = LLT::scalar(16);
3508   const LLT S32 = LLT::scalar(32);
3509   LLT StoreVT = MRI.getType(Reg);
3510   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3511 
3512   auto Unmerge = B.buildUnmerge(S16, Reg);
3513 
3514   SmallVector<Register, 4> WideRegs;
3515   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3516     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3517 
3518   int NumElts = StoreVT.getNumElements();
3519 
3520   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3521 }
3522 
3523 Register AMDGPULegalizerInfo::fixStoreSourceType(
3524   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3525   MachineRegisterInfo *MRI = B.getMRI();
3526   LLT Ty = MRI->getType(VData);
3527 
3528   const LLT S16 = LLT::scalar(16);
3529 
3530   // Fixup illegal register types for i8 stores.
3531   if (Ty == LLT::scalar(8) || Ty == S16) {
3532     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3533     return AnyExt;
3534   }
3535 
3536   if (Ty.isVector()) {
3537     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3538       if (IsFormat)
3539         return handleD16VData(B, *MRI, VData);
3540     }
3541   }
3542 
3543   return VData;
3544 }
3545 
3546 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3547                                               MachineRegisterInfo &MRI,
3548                                               MachineIRBuilder &B,
3549                                               bool IsTyped,
3550                                               bool IsFormat) const {
3551   Register VData = MI.getOperand(1).getReg();
3552   LLT Ty = MRI.getType(VData);
3553   LLT EltTy = Ty.getScalarType();
3554   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3555   const LLT S32 = LLT::scalar(32);
3556 
3557   VData = fixStoreSourceType(B, VData, IsFormat);
3558   Register RSrc = MI.getOperand(2).getReg();
3559 
3560   MachineMemOperand *MMO = *MI.memoperands_begin();
3561   const int MemSize = MMO->getSize();
3562 
3563   unsigned ImmOffset;
3564   unsigned TotalOffset;
3565 
3566   // The typed intrinsics add an immediate after the registers.
3567   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3568 
3569   // The struct intrinsic variants add one additional operand over raw.
3570   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3571   Register VIndex;
3572   int OpOffset = 0;
3573   if (HasVIndex) {
3574     VIndex = MI.getOperand(3).getReg();
3575     OpOffset = 1;
3576   }
3577 
3578   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3579   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3580 
3581   unsigned Format = 0;
3582   if (IsTyped) {
3583     Format = MI.getOperand(5 + OpOffset).getImm();
3584     ++OpOffset;
3585   }
3586 
3587   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3588 
3589   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3590   if (TotalOffset != 0)
3591     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3592 
3593   unsigned Opc;
3594   if (IsTyped) {
3595     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3596                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3597   } else if (IsFormat) {
3598     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3599                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3600   } else {
3601     switch (MemSize) {
3602     case 1:
3603       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3604       break;
3605     case 2:
3606       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3607       break;
3608     default:
3609       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3610       break;
3611     }
3612   }
3613 
3614   if (!VIndex)
3615     VIndex = B.buildConstant(S32, 0).getReg(0);
3616 
3617   auto MIB = B.buildInstr(Opc)
3618     .addUse(VData)              // vdata
3619     .addUse(RSrc)               // rsrc
3620     .addUse(VIndex)             // vindex
3621     .addUse(VOffset)            // voffset
3622     .addUse(SOffset)            // soffset
3623     .addImm(ImmOffset);         // offset(imm)
3624 
3625   if (IsTyped)
3626     MIB.addImm(Format);
3627 
3628   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3629      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3630      .addMemOperand(MMO);
3631 
3632   MI.eraseFromParent();
3633   return true;
3634 }
3635 
3636 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3637                                              MachineRegisterInfo &MRI,
3638                                              MachineIRBuilder &B,
3639                                              bool IsFormat,
3640                                              bool IsTyped) const {
3641   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3642   MachineMemOperand *MMO = *MI.memoperands_begin();
3643   const int MemSize = MMO->getSize();
3644   const LLT S32 = LLT::scalar(32);
3645 
3646   Register Dst = MI.getOperand(0).getReg();
3647   Register RSrc = MI.getOperand(2).getReg();
3648 
3649   // The typed intrinsics add an immediate after the registers.
3650   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3651 
3652   // The struct intrinsic variants add one additional operand over raw.
3653   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3654   Register VIndex;
3655   int OpOffset = 0;
3656   if (HasVIndex) {
3657     VIndex = MI.getOperand(3).getReg();
3658     OpOffset = 1;
3659   }
3660 
3661   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3662   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3663 
3664   unsigned Format = 0;
3665   if (IsTyped) {
3666     Format = MI.getOperand(5 + OpOffset).getImm();
3667     ++OpOffset;
3668   }
3669 
3670   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3671   unsigned ImmOffset;
3672   unsigned TotalOffset;
3673 
3674   LLT Ty = MRI.getType(Dst);
3675   LLT EltTy = Ty.getScalarType();
3676   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3677   const bool Unpacked = ST.hasUnpackedD16VMem();
3678 
3679   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3680   if (TotalOffset != 0)
3681     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3682 
3683   unsigned Opc;
3684 
3685   if (IsTyped) {
3686     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3687                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3688   } else if (IsFormat) {
3689     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3690                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3691   } else {
3692     switch (MemSize) {
3693     case 1:
3694       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3695       break;
3696     case 2:
3697       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3698       break;
3699     default:
3700       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3701       break;
3702     }
3703   }
3704 
3705   Register LoadDstReg;
3706 
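  // Sub-dword loads are widened to an s32 result and truncated back afterwards;
  // unpacked d16 vector loads produce one s32 per element and are repacked below.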
3707   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3708   LLT UnpackedTy = Ty.changeElementSize(32);
3709 
3710   if (IsExtLoad)
3711     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3712   else if (Unpacked && IsD16 && Ty.isVector())
3713     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3714   else
3715     LoadDstReg = Dst;
3716 
3717   if (!VIndex)
3718     VIndex = B.buildConstant(S32, 0).getReg(0);
3719 
3720   auto MIB = B.buildInstr(Opc)
3721     .addDef(LoadDstReg)         // vdata
3722     .addUse(RSrc)               // rsrc
3723     .addUse(VIndex)             // vindex
3724     .addUse(VOffset)            // voffset
3725     .addUse(SOffset)            // soffset
3726     .addImm(ImmOffset);         // offset(imm)
3727 
3728   if (IsTyped)
3729     MIB.addImm(Format);
3730 
3731   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3732      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3733      .addMemOperand(MMO);
3734 
3735   if (LoadDstReg != Dst) {
3736     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3737 
3738     // The result was widened for the extending load; truncate it back down.
3739     if (IsExtLoad)
3740       B.buildTrunc(Dst, LoadDstReg);
3741     else {
3742       // Repack to original 16-bit vector result
3743       // FIXME: G_TRUNC should work, but legalization currently fails
3744       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3745       SmallVector<Register, 4> Repack;
3746       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3747         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3748       B.buildMerge(Dst, Repack);
3749     }
3750   }
3751 
3752   MI.eraseFromParent();
3753   return true;
3754 }
3755 
3756 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3757                                                MachineIRBuilder &B,
3758                                                bool IsInc) const {
3759   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3760                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3761   B.buildInstr(Opc)
3762     .addDef(MI.getOperand(0).getReg())
3763     .addUse(MI.getOperand(2).getReg())
3764     .addUse(MI.getOperand(3).getReg())
3765     .cloneMemRefs(MI);
3766   MI.eraseFromParent();
3767   return true;
3768 }
3769 
3770 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3771   switch (IntrID) {
3772   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3773   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3774     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3775   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3776   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3777     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3778   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3779   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3780     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3781   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3782   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3783     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3784   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3785   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3786     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3787   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3788   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3789     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3790   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3791   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3792     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3793   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3794   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3795     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3796   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3797   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3798     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3799   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3800   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3801     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3802   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3803   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3804     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3805   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3806   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3807     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3808   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3809   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3810     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3811   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3812   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3813     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3814   default:
3815     llvm_unreachable("unhandled atomic opcode");
3816   }
3817 }
3818 
3819 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3820                                                MachineIRBuilder &B,
3821                                                Intrinsic::ID IID) const {
3822   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3823                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3824   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3825 
3826   Register Dst;
3827 
3828   int OpOffset = 0;
3829   if (HasReturn) {
3830     // A few FP atomics do not support return values.
3831     Dst = MI.getOperand(0).getReg();
3832   } else {
3833     OpOffset = -1;
3834   }
3835 
3836   Register VData = MI.getOperand(2 + OpOffset).getReg();
3837   Register CmpVal;
3838 
3839   if (IsCmpSwap) {
3840     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3841     ++OpOffset;
3842   }
3843 
3844   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3845   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3846 
3847   // The struct intrinsic variants add one additional operand over raw.
3848   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3849   Register VIndex;
3850   if (HasVIndex) {
3851     VIndex = MI.getOperand(4 + OpOffset).getReg();
3852     ++OpOffset;
3853   }
3854 
3855   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3856   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3857   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3858 
3859   MachineMemOperand *MMO = *MI.memoperands_begin();
3860 
3861   unsigned ImmOffset;
3862   unsigned TotalOffset;
3863   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3864   if (TotalOffset != 0)
3865     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3866 
3867   if (!VIndex)
3868     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3869 
3870   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3871 
3872   if (HasReturn)
3873     MIB.addDef(Dst);
3874 
3875   MIB.addUse(VData); // vdata
3876 
3877   if (IsCmpSwap)
3878     MIB.addReg(CmpVal);
3879 
3880   MIB.addUse(RSrc)               // rsrc
3881      .addUse(VIndex)             // vindex
3882      .addUse(VOffset)            // voffset
3883      .addUse(SOffset)            // soffset
3884      .addImm(ImmOffset)          // offset(imm)
3885      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3886      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3887      .addMemOperand(MMO);
3888 
3889   MI.eraseFromParent();
3890   return true;
3891 }
3892 
3893 /// Turn the set of s16 typed address registers of \p MI into dword sized
3894 /// vectors with s16 typed elements, appending them to \p PackedAddrs.
3895 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3896                                         SmallVectorImpl<Register> &PackedAddrs,
3897                                         int AddrIdx, int DimIdx, int EndIdx,
3898                                         int NumGradients) {
3899   const LLT S16 = LLT::scalar(16);
3900   const LLT V2S16 = LLT::vector(2, 16);
3901 
3902   for (int I = AddrIdx; I < EndIdx; ++I) {
3903     MachineOperand &SrcOp = MI.getOperand(I);
3904     if (!SrcOp.isReg())
3905       continue; // _L to _LZ may have eliminated this.
3906 
3907     Register AddrReg = SrcOp.getReg();
3908 
3909     if (I < DimIdx) {
3910       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3911       PackedAddrs.push_back(AddrReg);
3912     } else {
3913       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3914       // derivatives dx/dh and dx/dv are packed with undef.
3915       if (((I + 1) >= EndIdx) ||
3916           ((NumGradients / 2) % 2 == 1 &&
3917            (I == DimIdx + (NumGradients / 2) - 1 ||
3918             I == DimIdx + NumGradients - 1)) ||
3919           // Check for _L to _LZ optimization
3920           !MI.getOperand(I + 1).isReg()) {
3921         PackedAddrs.push_back(
3922             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3923                 .getReg(0));
3924       } else {
3925         PackedAddrs.push_back(
3926             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3927                 .getReg(0));
3928         ++I;
3929       }
3930     }
3931   }
3932 }
3933 
3934 /// Convert from separate vaddr components to a single vector address register,
3935 /// and replace the remaining operands with $noreg.
3936 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3937                                      int DimIdx, int NumVAddrs) {
3938   const LLT S32 = LLT::scalar(32);
3939 
3940   SmallVector<Register, 8> AddrRegs;
3941   for (int I = 0; I != NumVAddrs; ++I) {
3942     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3943     if (SrcOp.isReg()) {
3944       AddrRegs.push_back(SrcOp.getReg());
3945       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3946     }
3947   }
3948 
3949   int NumAddrRegs = AddrRegs.size();
3950   if (NumAddrRegs != 1) {
3951     // Round up to 8 elements for v5-v7
3952     // FIXME: Missing intermediate sized register classes and instructions.
3953     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3954       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3955       auto Undef = B.buildUndef(S32);
3956       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3957       NumAddrRegs = RoundedNumRegs;
3958     }
3959 
3960     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3961     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3962   }
3963 
3964   for (int I = 1; I != NumVAddrs; ++I) {
3965     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3966     if (SrcOp.isReg())
3967       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3968   }
3969 }
3970 
3971 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3972 ///
3973 /// Depending on the subtarget, load/store with 16-bit element data need to be
3974 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3975 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3976 /// registers.
3977 ///
3978 /// We don't want to directly select image instructions just yet, but also want
3979 /// to expose all register repacking to the legalizer/combiners. We also don't
3980 /// want a selected instruction entering RegBankSelect. In order to avoid
3981 /// defining a multitude of intermediate image instructions, directly hack on
3982 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3983 /// now unnecessary arguments with $noreg.
3984 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3985     MachineInstr &MI, MachineIRBuilder &B,
3986     GISelChangeObserver &Observer,
3987     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3988 
3989   const int NumDefs = MI.getNumExplicitDefs();
3990   bool IsTFE = NumDefs == 2;
3991   // We are only processing the operands of d16 image operations on subtargets
3992   // that use the unpacked register layout, or need to repack the TFE result.
3993 
3994   // TODO: Do we need to guard against already legalized intrinsics?
3995   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3996     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3997 
3998   MachineRegisterInfo *MRI = B.getMRI();
3999   const LLT S32 = LLT::scalar(32);
4000   const LLT S16 = LLT::scalar(16);
4001   const LLT V2S16 = LLT::vector(2, 16);
4002 
4003   // Index of first address argument
4004   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
4005 
4006   int NumVAddrs, NumGradients;
4007   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
4008   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
4009     getDMaskIdx(BaseOpcode, NumDefs);
4010   unsigned DMask = 0;
4011 
4012   // Check for 16 bit addresses and pack if true.
4013   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4014   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
4015   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
4016   const bool IsG16 = GradTy == S16;
4017   const bool IsA16 = AddrTy == S16;
4018 
4019   int DMaskLanes = 0;
4020   if (!BaseOpcode->Atomic) {
4021     DMask = MI.getOperand(DMaskIdx).getImm();
4022     if (BaseOpcode->Gather4) {
4023       DMaskLanes = 4;
4024     } else if (DMask != 0) {
4025       DMaskLanes = countPopulation(DMask);
4026     } else if (!IsTFE && !BaseOpcode->Store) {
4027       // If dmask is 0, this is a no-op load. This can be eliminated.
4028       B.buildUndef(MI.getOperand(0));
4029       MI.eraseFromParent();
4030       return true;
4031     }
4032   }
4033 
4034   Observer.changingInstr(MI);
4035   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4036 
4037   unsigned NewOpcode = NumDefs == 0 ?
4038     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4039 
4040   // Track that we legalized this
4041   MI.setDesc(B.getTII().get(NewOpcode));
4042 
4043   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4044   // dmask to be at least 1, otherwise the instruction will fail.
4045   if (IsTFE && DMask == 0) {
4046     DMask = 0x1;
4047     DMaskLanes = 1;
4048     MI.getOperand(DMaskIdx).setImm(DMask);
4049   }
4050 
4051   if (BaseOpcode->Atomic) {
4052     Register VData0 = MI.getOperand(2).getReg();
4053     LLT Ty = MRI->getType(VData0);
4054 
4055     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4056     if (Ty.isVector())
4057       return false;
4058 
4059     if (BaseOpcode->AtomicX2) {
4060       Register VData1 = MI.getOperand(3).getReg();
4061       // The two values are packed in one register.
4062       LLT PackedTy = LLT::vector(2, Ty);
4063       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4064       MI.getOperand(2).setReg(Concat.getReg(0));
4065       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4066     }
4067   }
4068 
4069   int CorrectedNumVAddrs = NumVAddrs;
4070 
4071   // Optimize _L to _LZ when _L is zero
4072   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4073         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
4074     const ConstantFP *ConstantLod;
4075     const int LodIdx = AddrIdx + NumVAddrs - 1;
4076 
4077     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
4078       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4079         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4080         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
4081           LZMappingInfo->LZ, ImageDimIntr->Dim);
4082 
4083         // The starting indexes should remain in the same place.
4084         --NumVAddrs;
4085         --CorrectedNumVAddrs;
4086 
4087         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
4088           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
4089         MI.RemoveOperand(LodIdx);
4090       }
4091     }
4092   }
4093 
4094   // Optimize _mip away, when 'lod' is zero
4095   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
4096     int64_t ConstantLod;
4097     const int LodIdx = AddrIdx + NumVAddrs - 1;
4098 
4099     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
4100       if (ConstantLod == 0) {
4101         // TODO: Change the intrinsic opcode and remove the operand instead of
4102         // replacing it with 0, as is done for the _L to _LZ handling above.
4103         MI.getOperand(LodIdx).ChangeToImmediate(0);
4104         --CorrectedNumVAddrs;
4105       }
4106     }
4107   }
4108 
4109   // Rewrite the addressing register layout before doing anything else.
4110   if (IsA16 || IsG16) {
4111     if (IsA16) {
4112       // Target must support the feature and gradients need to be 16 bit too
4113       if (!ST.hasA16() || !IsG16)
4114         return false;
4115     } else if (!ST.hasG16())
4116       return false;
4117 
4118     if (NumVAddrs > 1) {
4119       SmallVector<Register, 4> PackedRegs;
4120       // Don't compress addresses for G16
4121       const int PackEndIdx =
4122           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4123       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4124                                   PackEndIdx, NumGradients);
4125 
4126       if (!IsA16) {
4127         // Add uncompressed address
4128         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4129           Register AddrReg = MI.getOperand(I).getReg();
4130           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4131           PackedRegs.push_back(AddrReg);
4132         }
4133       }
4134 
4135       // See also below in the non-a16 branch
4136       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4137 
4138       if (!UseNSA && PackedRegs.size() > 1) {
4139         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4140         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4141         PackedRegs[0] = Concat.getReg(0);
4142         PackedRegs.resize(1);
4143       }
4144 
4145       const int NumPacked = PackedRegs.size();
4146       for (int I = 0; I != NumVAddrs; ++I) {
4147         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4148         if (!SrcOp.isReg()) {
4149           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4150           continue;
4151         }
4152 
4153         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4154 
4155         if (I < NumPacked)
4156           SrcOp.setReg(PackedRegs[I]);
4157         else
4158           SrcOp.setReg(AMDGPU::NoRegister);
4159       }
4160     }
4161   } else {
4162     // If the register allocator cannot place the address registers contiguously
4163     // without introducing moves, then using the non-sequential address encoding
4164     // is always preferable, since it saves VALU instructions and is usually a
4165     // wash in terms of code size or even better.
4166     //
4167     // However, we currently have no way of hinting to the register allocator
4168     // that MIMG addresses should be placed contiguously when it is possible to
4169     // do so, so force non-NSA for the common 2-address case as a heuristic.
4170     //
4171     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4172     // allocation when possible.
4173     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4174 
4175     if (!UseNSA && NumVAddrs > 1)
4176       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4177   }
4178 
4179   int Flags = 0;
4180   if (IsA16)
4181     Flags |= 1;
4182   if (IsG16)
4183     Flags |= 2;
4184   MI.addOperand(MachineOperand::CreateImm(Flags));
4185 
4186   if (BaseOpcode->Store) { // No TFE for stores?
4187     // TODO: Handle dmask trim
4188     Register VData = MI.getOperand(1).getReg();
4189     LLT Ty = MRI->getType(VData);
4190     if (!Ty.isVector() || Ty.getElementType() != S16)
4191       return true;
4192 
4193     Register RepackedReg = handleD16VData(B, *MRI, VData);
4194     if (RepackedReg != VData) {
4195       MI.getOperand(1).setReg(RepackedReg);
4196     }
4197 
4198     return true;
4199   }
4200 
4201   Register DstReg = MI.getOperand(0).getReg();
4202   LLT Ty = MRI->getType(DstReg);
4203   const LLT EltTy = Ty.getScalarType();
4204   const bool IsD16 = Ty.getScalarType() == S16;
4205   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4206 
4207   // Confirm that the return type is large enough for the dmask specified
4208   if (NumElts < DMaskLanes)
4209     return false;
4210 
4211   if (NumElts > 4 || DMaskLanes > 4)
4212     return false;
4213 
4214   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4215   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4216 
4217   // The raw dword aligned data component of the load. The only legal cases
4218   // where this matters should be when using the packed D16 format, for
4219   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4220   LLT RoundedTy;
4221 
4222   // S32 vector to cover all data, plus the TFE result element.
4223   LLT TFETy;
4224 
4225   // Register type to use for each loaded component. Will be S32 or V2S16.
4226   LLT RegTy;
4227 
4228   if (IsD16 && ST.hasUnpackedD16VMem()) {
4229     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4230     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4231     RegTy = S32;
4232   } else {
4233     unsigned EltSize = EltTy.getSizeInBits();
4234     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4235     unsigned RoundedSize = 32 * RoundedElts;
4236     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4237     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4238     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4239   }
4240 
4241   // The return type does not need adjustment.
4242   // TODO: Should we change s16 case to s32 or <2 x s16>?
4243   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4244     return true;
4245 
4246   Register Dst1Reg;
4247 
4248   // Insert after the instruction.
4249   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4250 
4251   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4252   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4253   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4254   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4255 
4256   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4257 
4258   MI.getOperand(0).setReg(NewResultReg);
4259 
4260   // In the IR, TFE is supposed to be used with a 2 element struct return
4261   // type. The instruction really returns these two values in one contiguous
4262   // register, with one additional dword beyond the loaded data. Rewrite the
4263   // return type to use a single register result.
4264 
4265   if (IsTFE) {
4266     Dst1Reg = MI.getOperand(1).getReg();
4267     if (MRI->getType(Dst1Reg) != S32)
4268       return false;
4269 
4270     // TODO: Make sure the TFE operand bit is set.
4271     MI.RemoveOperand(1);
4272 
4273     // Handle the easy case that requires no repack instructions.
4274     if (Ty == S32) {
4275       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4276       return true;
4277     }
4278   }
4279 
4280   // Now figure out how to copy the new result register back into the old
4281   // result.
4282   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4283 
4284   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4285 
4286   if (ResultNumRegs == 1) {
4287     assert(!IsTFE);
4288     ResultRegs[0] = NewResultReg;
4289   } else {
4290     // We have to repack into a new vector of some kind.
4291     for (int I = 0; I != NumDataRegs; ++I)
4292       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4293     B.buildUnmerge(ResultRegs, NewResultReg);
4294 
4295     // Drop the final TFE element to get the data part. The TFE result is
4296     // directly written to the right place already.
4297     if (IsTFE)
4298       ResultRegs.resize(NumDataRegs);
4299   }
4300 
4301   // For an s16 scalar result, we form an s32 result with a truncate regardless
4302   // of packed vs. unpacked.
4303   if (IsD16 && !Ty.isVector()) {
4304     B.buildTrunc(DstReg, ResultRegs[0]);
4305     return true;
4306   }
4307 
4308   // Avoid a build/concat_vector of 1 entry.
4309   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4310     B.buildBitcast(DstReg, ResultRegs[0]);
4311     return true;
4312   }
4313 
4314   assert(Ty.isVector());
4315 
4316   if (IsD16) {
4317     // For packed D16 results with TFE enabled, all the data components are
4318     // S32. Cast back to the expected type.
4319     //
4320     // TODO: We don't really need to load s32 elements. We would only need
4321     // one cast for the TFE result if a multiple of v2s16 was used.
4322     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4323       for (Register &Reg : ResultRegs)
4324         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4325     } else if (ST.hasUnpackedD16VMem()) {
4326       for (Register &Reg : ResultRegs)
4327         Reg = B.buildTrunc(S16, Reg).getReg(0);
4328     }
4329   }
4330 
4331   auto padWithUndef = [&](LLT PadTy, int NumPadElts) {
4332     if (NumPadElts == 0)
4333       return;
4334     Register Undef = B.buildUndef(PadTy).getReg(0);
4335     for (int I = 0; I != NumPadElts; ++I)
4336       ResultRegs.push_back(Undef);
4337   };
4338 
4339   // Pad out any elements eliminated due to the dmask.
4340   LLT ResTy = MRI->getType(ResultRegs[0]);
4341   if (!ResTy.isVector()) {
4342     padWithUndef(ResTy, NumElts - ResultRegs.size());
4343     B.buildBuildVector(DstReg, ResultRegs);
4344     return true;
4345   }
4346 
4347   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
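       // Number of 32-bit registers needed to cover the original result type.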
4348   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4349 
4350   // Deal with the one annoying legal case.
4351   const LLT V3S16 = LLT::vector(3, 16);
4352   if (Ty == V3S16) {
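         // Pad to v6s16, then unmerge into two v3s16 halves and keep the first.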
4353     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4354     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4355     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4356     return true;
4357   }
4358 
4359   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4360   B.buildConcatVectors(DstReg, ResultRegs);
4361   return true;
4362 }
4363 
4364 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4365   LegalizerHelper &Helper, MachineInstr &MI) const {
4366   MachineIRBuilder &B = Helper.MIRBuilder;
4367   GISelChangeObserver &Observer = Helper.Observer;
4368 
4369   Register Dst = MI.getOperand(0).getReg();
4370   LLT Ty = B.getMRI()->getType(Dst);
4371   unsigned Size = Ty.getSizeInBits();
4372   MachineFunction &MF = B.getMF();
4373 
4374   Observer.changingInstr(MI);
4375 
4376   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4377     Ty = getBitcastRegisterType(Ty);
4378     Helper.bitcastDst(MI, Ty, 0);
4379     Dst = MI.getOperand(0).getReg();
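         // bitcastDst leaves the builder's insert point after the new cast;
         // reset it to point at MI.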
4380     B.setInsertPt(B.getMBB(), MI);
4381   }
4382 
4383   // FIXME: We don't really need this intermediate instruction. The intrinsic
4384   // should be fixed to have a memory operand. Since it's readnone, we're not
4385   // allowed to add one.
4386   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4387   MI.RemoveOperand(1); // Remove intrinsic ID
4388 
4389   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4390   // TODO: Should this use datalayout alignment?
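       // Round the result size in bits up to whole bytes.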
4391   const unsigned MemSize = (Size + 7) / 8;
4392   const Align MemAlign(4);
4393   MachineMemOperand *MMO = MF.getMachineMemOperand(
4394       MachinePointerInfo(),
4395       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4396           MachineMemOperand::MOInvariant,
4397       MemSize, MemAlign);
4398   MI.addMemOperand(MF, MMO);
4399 
4400   // There are no 96-bit result scalar loads, but widening to 128-bit should
4401   // always be legal. We may need to restore this to a 96-bit result if it turns
4402   // out this needs to be converted to a vector load during RegBankSelect.
4403   if (!isPowerOf2_32(Size)) {
4404     if (Ty.isVector())
4405       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4406     else
4407       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4408   }
4409 
4410   Observer.changedInstr(MI);
4411   return true;
4412 }
4413 
4414 // TODO: Move to selection
4415 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4416                                                 MachineRegisterInfo &MRI,
4417                                                 MachineIRBuilder &B) const {
4418   // Insert s_endpgm if the trap handler is disabled or this is a non-HSA path.
4419   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4420       !ST.isTrapHandlerEnabled()) {
4421     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4422   } else {
4423     // Pass queue pointer to trap handler as input, and insert trap instruction
4424     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4425     MachineRegisterInfo &MRI = *B.getMRI();
4426 
4427     Register LiveIn =
4428       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4429     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4430       return false;
4431 
4432     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4433     B.buildCopy(SGPR01, LiveIn);
4434     B.buildInstr(AMDGPU::S_TRAP)
4435         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4436         .addReg(SGPR01, RegState::Implicit);
4437   }
4438 
4439   MI.eraseFromParent();
4440   return true;
4441 }
4442 
4443 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4444     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4445   // If this is a non-HSA path or the trap handler is disabled, report a
4446   // warning accordingly.
4447   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4448       !ST.isTrapHandlerEnabled()) {
4449     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4450                                      "debugtrap handler not supported",
4451                                      MI.getDebugLoc(), DS_Warning);
4452     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4453     Ctx.diagnose(NoTrap);
4454   } else {
4455     // Insert debug-trap instruction
4456     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4457   }
4458 
4459   MI.eraseFromParent();
4460   return true;
4461 }
4462 
4463 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4464                                             MachineInstr &MI) const {
4465   MachineIRBuilder &B = Helper.MIRBuilder;
4466   MachineRegisterInfo &MRI = *B.getMRI();
4467 
4468   // Replace the G_BRCOND user with the exec manipulation and branch pseudos.
4469   auto IntrID = MI.getIntrinsicID();
4470   switch (IntrID) {
4471   case Intrinsic::amdgcn_if:
4472   case Intrinsic::amdgcn_else: {
4473     MachineInstr *Br = nullptr;
4474     MachineBasicBlock *UncondBrTarget = nullptr;
4475     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4476       const SIRegisterInfo *TRI
4477         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4478 
4479       Register Def = MI.getOperand(1).getReg();
4480       Register Use = MI.getOperand(3).getReg();
4481 
4482       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4483       B.setInsertPt(B.getMBB(), BrCond->getIterator());
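           // SI_IF/SI_ELSE branch to UncondBrTarget when the resulting exec
           // mask is empty; the original conditional successor is reached via
           // the unconditional branch retargeted below.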
4484       if (IntrID == Intrinsic::amdgcn_if) {
4485         B.buildInstr(AMDGPU::SI_IF)
4486           .addDef(Def)
4487           .addUse(Use)
4488           .addMBB(UncondBrTarget);
4489       } else {
4490         B.buildInstr(AMDGPU::SI_ELSE)
4491           .addDef(Def)
4492           .addUse(Use)
4493           .addMBB(UncondBrTarget)
4494           .addImm(0);
4495       }
4496 
4497       if (Br) {
4498         Br->getOperand(0).setMBB(CondBrTarget);
4499       } else {
4500         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4501         // since we're swapping branch targets it needs to be reinserted.
4502         // FIXME: IRTranslator should probably not do this
4503         B.buildBr(*CondBrTarget);
4504       }
4505 
4506       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4507       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4508       MI.eraseFromParent();
4509       BrCond->eraseFromParent();
4510       return true;
4511     }
4512 
4513     return false;
4514   }
4515   case Intrinsic::amdgcn_loop: {
4516     MachineInstr *Br = nullptr;
4517     MachineBasicBlock *UncondBrTarget = nullptr;
4518     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4519       const SIRegisterInfo *TRI
4520         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4521 
4522       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4523       Register Reg = MI.getOperand(2).getReg();
4524 
4525       B.setInsertPt(B.getMBB(), BrCond->getIterator());
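           // SI_LOOP clears finished lanes from exec and branches back to
           // UncondBrTarget while any lanes remain active; the retargeted
           // branch below exits the loop.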
4526       B.buildInstr(AMDGPU::SI_LOOP)
4527         .addUse(Reg)
4528         .addMBB(UncondBrTarget);
4529 
4530       if (Br)
4531         Br->getOperand(0).setMBB(CondBrTarget);
4532       else
4533         B.buildBr(*CondBrTarget);
4534 
4535       MI.eraseFromParent();
4536       BrCond->eraseFromParent();
4537       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4538       return true;
4539     }
4540 
4541     return false;
4542   }
4543   case Intrinsic::amdgcn_kernarg_segment_ptr:
4544     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4545       // This only makes sense to call in a kernel, so just lower to null.
4546       B.buildConstant(MI.getOperand(0).getReg(), 0);
4547       MI.eraseFromParent();
4548       return true;
4549     }
4550 
4551     return legalizePreloadedArgIntrin(
4552       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4553   case Intrinsic::amdgcn_implicitarg_ptr:
4554     return legalizeImplicitArgPtr(MI, MRI, B);
4555   case Intrinsic::amdgcn_workitem_id_x:
4556     return legalizePreloadedArgIntrin(MI, MRI, B,
4557                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4558   case Intrinsic::amdgcn_workitem_id_y:
4559     return legalizePreloadedArgIntrin(MI, MRI, B,
4560                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4561   case Intrinsic::amdgcn_workitem_id_z:
4562     return legalizePreloadedArgIntrin(MI, MRI, B,
4563                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4564   case Intrinsic::amdgcn_workgroup_id_x:
4565     return legalizePreloadedArgIntrin(MI, MRI, B,
4566                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4567   case Intrinsic::amdgcn_workgroup_id_y:
4568     return legalizePreloadedArgIntrin(MI, MRI, B,
4569                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4570   case Intrinsic::amdgcn_workgroup_id_z:
4571     return legalizePreloadedArgIntrin(MI, MRI, B,
4572                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4573   case Intrinsic::amdgcn_dispatch_ptr:
4574     return legalizePreloadedArgIntrin(MI, MRI, B,
4575                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4576   case Intrinsic::amdgcn_queue_ptr:
4577     return legalizePreloadedArgIntrin(MI, MRI, B,
4578                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4579   case Intrinsic::amdgcn_implicit_buffer_ptr:
4580     return legalizePreloadedArgIntrin(
4581       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4582   case Intrinsic::amdgcn_dispatch_id:
4583     return legalizePreloadedArgIntrin(MI, MRI, B,
4584                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4585   case Intrinsic::amdgcn_fdiv_fast:
4586     return legalizeFDIVFastIntrin(MI, MRI, B);
4587   case Intrinsic::amdgcn_is_shared:
4588     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4589   case Intrinsic::amdgcn_is_private:
4590     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4591   case Intrinsic::amdgcn_wavefrontsize: {
4592     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4593     MI.eraseFromParent();
4594     return true;
4595   }
4596   case Intrinsic::amdgcn_s_buffer_load:
4597     return legalizeSBufferLoad(Helper, MI);
4598   case Intrinsic::amdgcn_raw_buffer_store:
4599   case Intrinsic::amdgcn_struct_buffer_store:
4600     return legalizeBufferStore(MI, MRI, B, false, false);
4601   case Intrinsic::amdgcn_raw_buffer_store_format:
4602   case Intrinsic::amdgcn_struct_buffer_store_format:
4603     return legalizeBufferStore(MI, MRI, B, false, true);
4604   case Intrinsic::amdgcn_raw_tbuffer_store:
4605   case Intrinsic::amdgcn_struct_tbuffer_store:
4606     return legalizeBufferStore(MI, MRI, B, true, true);
4607   case Intrinsic::amdgcn_raw_buffer_load:
4608   case Intrinsic::amdgcn_struct_buffer_load:
4609     return legalizeBufferLoad(MI, MRI, B, false, false);
4610   case Intrinsic::amdgcn_raw_buffer_load_format:
4611   case Intrinsic::amdgcn_struct_buffer_load_format:
4612     return legalizeBufferLoad(MI, MRI, B, true, false);
4613   case Intrinsic::amdgcn_raw_tbuffer_load:
4614   case Intrinsic::amdgcn_struct_tbuffer_load:
4615     return legalizeBufferLoad(MI, MRI, B, true, true);
4616   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4617   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4618   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4619   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4620   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4621   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4622   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4623   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4624   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4625   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4626   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4627   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4628   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4629   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4630   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4631   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4632   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4633   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4634   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4635   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4636   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4637   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4638   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4639   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4640   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4641   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4642   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4643   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4644     return legalizeBufferAtomic(MI, B, IntrID);
4645   case Intrinsic::amdgcn_atomic_inc:
4646     return legalizeAtomicIncDec(MI, B, true);
4647   case Intrinsic::amdgcn_atomic_dec:
4648     return legalizeAtomicIncDec(MI, B, false);
4649   case Intrinsic::trap:
4650     return legalizeTrapIntrinsic(MI, MRI, B);
4651   case Intrinsic::debugtrap:
4652     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4653   case Intrinsic::amdgcn_rsq_clamp:
4654     return legalizeRsqClampIntrinsic(MI, MRI, B);
4655   case Intrinsic::amdgcn_ds_fadd:
4656   case Intrinsic::amdgcn_ds_fmin:
4657   case Intrinsic::amdgcn_ds_fmax:
4658     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4659   default: {
4660     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4661             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4662       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4663     return true;
4664   }
4665   }
4666 
4667   return true;
4668 }
4669