1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
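// e.g. <3 x s16> is rounded up to <4 x s16>, and <5 x s32> to <8 x s32>.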
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
55 // Round the number of bits to the next power of two bits
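// e.g. s24 is rounded up to s32, and s65 to s128.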
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99   };
100 }
101 
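// Break a wide vector into 64-bit sized pieces,
// e.g. <4 x s32> is split into two <2 x s32> halves.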
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105     const LLT EltTy = Ty.getElementType();
106     unsigned Size = Ty.getSizeInBits();
107     unsigned Pieces = (Size + 63) / 64;
108     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110   };
111 }
112 
113 // Increase the number of vector elements to reach the next multiple of 32-bit
114 // type.
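// e.g. <3 x s8> (24 bits) is widened to <4 x s8> and <5 x s8> to <8 x s8>.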
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116   return [=](const LegalityQuery &Query) {
117     const LLT Ty = Query.Types[TypeIdx];
118 
119     const LLT EltTy = Ty.getElementType();
120     const int Size = Ty.getSizeInBits();
121     const int EltSize = EltTy.getSizeInBits();
122     const int NextMul32 = (Size + 31) / 32;
123 
124     assert(EltSize < 32);
125 
126     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128   };
129 }
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132   const unsigned Size = Ty.getSizeInBits();
133 
135   if (Size <= 32) {
136     // <2 x s8> -> s16
137     // <4 x s8> -> s32
138     return LLT::scalar(Size);
139   }
140 
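  // Use 32-bit pieces otherwise,
  // e.g. <4 x s16> -> <2 x s32> and s96 -> <3 x s32>.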
141   return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT Ty = Query.Types[TypeIdx];
147     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148   };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152   return [=](const LegalityQuery &Query) {
153     const LLT Ty = Query.Types[TypeIdx];
154     unsigned Size = Ty.getSizeInBits();
155     assert(Size % 32 == 0);
156     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157   };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161   return [=](const LegalityQuery &Query) {
162     const LLT QueryTy = Query.Types[TypeIdx];
163     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164   };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171   };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178   };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182   return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186   const int EltSize = EltTy.getSizeInBits();
187   return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191   const int EltSize = Ty.getElementType().getSizeInBits();
192   return EltSize == 32 || EltSize == 64 ||
193          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194          EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198   if (!isRegisterSize(Ty.getSizeInBits()))
199     return false;
200 
201   if (Ty.isVector())
202     return isRegisterVectorType(Ty);
203 
204   return true;
205 }
206 
207 // Any combination of 32 or 64-bit elements up to the maximum register size, and
208 // multiples of v2s16.
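// e.g. <8 x s32> and s96 are register types, but <3 x s16> (48 bits) is not.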
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     return isRegisterType(Query.Types[TypeIdx]);
212   };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216   return [=](const LegalityQuery &Query) {
217     const LLT QueryTy = Query.Types[TypeIdx];
218     if (!QueryTy.isVector())
219       return false;
220     const LLT EltTy = QueryTy.getElementType();
221     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222   };
223 }
224 
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226   return [=](const LegalityQuery &Query) {
227     const LLT Ty = Query.Types[TypeIdx];
228     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230   };
231 }
232 
233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237                                     bool IsLoad) {
238   switch (AS) {
239   case AMDGPUAS::PRIVATE_ADDRESS:
240     // FIXME: Private element size.
241     return ST.enableFlatScratch() ? 128 : 32;
242   case AMDGPUAS::LOCAL_ADDRESS:
243     return ST.useDS128() ? 128 : 64;
244   case AMDGPUAS::GLOBAL_ADDRESS:
245   case AMDGPUAS::CONSTANT_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247     // Treat constant and global as identical. SMRD loads are sometimes usable
248     // for global loads (ideally constant address space should be eliminated)
249     // depending on the context. Legality cannot be context dependent, but
250     // RegBankSelect can split the load as necessary depending on the pointer
251     // register bank/uniformity and if the memory is invariant or not written
252     // in a kernel.
253     return IsLoad ? 512 : 128;
254   default:
255     // Flat addresses may contextually need to be split to 32-bit parts if they
256     // may alias scratch depending on the subtarget.
257     return 128;
258   }
259 }
260 
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262                                  const LegalityQuery &Query,
263                                  unsigned Opcode) {
264   const LLT Ty = Query.Types[0];
265 
266   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267   const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 
269   unsigned RegSize = Ty.getSizeInBits();
270   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272   unsigned AS = Query.Types[1].getAddressSpace();
273 
274   // All of these need to be custom lowered to cast the pointer operand.
275   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276     return false;
277 
278   // TODO: We should be able to widen loads if the alignment is high enough, but
279   // we also need to modify the memory access size.
280 #if 0
281   // Accept widening loads based on alignment.
282   if (IsLoad && MemSize < Size)
283     MemSize = std::max(MemSize, Align);
284 #endif
285 
286   // Only 1-byte and 2-byte to 32-bit extloads are valid.
287   if (MemSize != RegSize && RegSize != 32)
288     return false;
289 
290   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291     return false;
292 
293   switch (MemSize) {
294   case 8:
295   case 16:
296   case 32:
297   case 64:
298   case 128:
299     break;
300   case 96:
301     if (!ST.hasDwordx3LoadStores())
302       return false;
303     break;
304   case 256:
305   case 512:
306     // These may contextually need to be broken down.
307     break;
308   default:
309     return false;
310   }
311 
312   assert(RegSize >= MemSize);
313 
314   if (AlignBits < MemSize) {
315     const SITargetLowering *TLI = ST.getTargetLowering();
316     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317                                                  Align(AlignBits / 8)))
318       return false;
319   }
320 
321   return true;
322 }
323 
324 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
325 // work around this. Eventually it should ignore the type for loads and only
326 // care about the size. Return true in cases where we will work around this for
327 // now by bitcasting.
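// e.g. s96 and <6 x s16> loads are currently bitcast to <3 x s32>.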
328 static bool loadStoreBitcastWorkaround(const LLT Ty) {
329   if (EnableNewLegality)
330     return false;
331 
332   const unsigned Size = Ty.getSizeInBits();
333   if (Size <= 64)
334     return false;
335   if (!Ty.isVector())
336     return true;
337 
338   LLT EltTy = Ty.getElementType();
339   if (EltTy.isPointer())
340     return true;
341 
342   unsigned EltSize = EltTy.getSizeInBits();
343   return EltSize != 32 && EltSize != 64;
344 }
345 
346 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347                              unsigned Opcode) {
348   const LLT Ty = Query.Types[0];
349   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350          !loadStoreBitcastWorkaround(Ty);
351 }
352 
353 /// Return true if a load or store of the type should be lowered with a bitcast
354 /// to a different type.
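/// e.g. a 32-bit load of <4 x s8> is handled by bitcasting to a load of s32.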
355 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356                                        const unsigned MemSizeInBits) {
357   const unsigned Size = Ty.getSizeInBits();
358   if (Size != MemSizeInBits)
359     return Size <= 32 && Ty.isVector();
360 
361   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362     return true;
363   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364          !isRegisterVectorElementType(Ty.getElementType());
365 }
366 
367 /// Return true if we should legalize a load by widening an odd sized memory
368 /// access up to the alignment. Note that in this case the memory access itself
369 /// changes, not the size of the result register.
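/// e.g. a 96-bit load known to be 128-bit aligned may be widened to 128 bits.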
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371                             unsigned AlignInBits, unsigned AddrSpace,
372                             unsigned Opcode) {
373   // We don't want to widen cases that are naturally legal.
374   if (isPowerOf2_32(SizeInBits))
375     return false;
376 
377   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378   // end up widening these for a scalar load during RegBankSelect, since there
379   // aren't 96-bit scalar loads.
380   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381     return false;
382 
383   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384     return false;
385 
386   // A load is known dereferenceable up to the alignment, so it's legal to widen
387   // to it.
388   //
389   // TODO: Could check dereferenceable for less aligned cases.
390   unsigned RoundedSize = NextPowerOf2(SizeInBits);
391   if (AlignInBits < RoundedSize)
392     return false;
393 
394   // Do not widen if it would introduce a slow unaligned load.
395   const SITargetLowering *TLI = ST.getTargetLowering();
396   bool Fast = false;
397   return TLI->allowsMisalignedMemoryAccessesImpl(
398              RoundedSize, AddrSpace, Align(AlignInBits / 8),
399              MachineMemOperand::MOLoad, &Fast) &&
400          Fast;
401 }
402 
403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404                             unsigned Opcode) {
405   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406     return false;
407 
408   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409                          Query.MMODescrs[0].AlignInBits,
410                          Query.Types[1].getAddressSpace(), Opcode);
411 }
412 
413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414                                          const GCNTargetMachine &TM)
415   :  ST(ST_) {
416   using namespace TargetOpcode;
417 
418   auto GetAddrSpacePtr = [&TM](unsigned AS) {
419     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420   };
421 
422   const LLT S1 = LLT::scalar(1);
423   const LLT S8 = LLT::scalar(8);
424   const LLT S16 = LLT::scalar(16);
425   const LLT S32 = LLT::scalar(32);
426   const LLT S64 = LLT::scalar(64);
427   const LLT S128 = LLT::scalar(128);
428   const LLT S256 = LLT::scalar(256);
429   const LLT S512 = LLT::scalar(512);
430   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432   const LLT V2S8 = LLT::vector(2, 8);
433   const LLT V2S16 = LLT::vector(2, 16);
434   const LLT V4S16 = LLT::vector(4, 16);
435 
436   const LLT V2S32 = LLT::vector(2, 32);
437   const LLT V3S32 = LLT::vector(3, 32);
438   const LLT V4S32 = LLT::vector(4, 32);
439   const LLT V5S32 = LLT::vector(5, 32);
440   const LLT V6S32 = LLT::vector(6, 32);
441   const LLT V7S32 = LLT::vector(7, 32);
442   const LLT V8S32 = LLT::vector(8, 32);
443   const LLT V9S32 = LLT::vector(9, 32);
444   const LLT V10S32 = LLT::vector(10, 32);
445   const LLT V11S32 = LLT::vector(11, 32);
446   const LLT V12S32 = LLT::vector(12, 32);
447   const LLT V13S32 = LLT::vector(13, 32);
448   const LLT V14S32 = LLT::vector(14, 32);
449   const LLT V15S32 = LLT::vector(15, 32);
450   const LLT V16S32 = LLT::vector(16, 32);
451   const LLT V32S32 = LLT::vector(32, 32);
452 
453   const LLT V2S64 = LLT::vector(2, 64);
454   const LLT V3S64 = LLT::vector(3, 64);
455   const LLT V4S64 = LLT::vector(4, 64);
456   const LLT V5S64 = LLT::vector(5, 64);
457   const LLT V6S64 = LLT::vector(6, 64);
458   const LLT V7S64 = LLT::vector(7, 64);
459   const LLT V8S64 = LLT::vector(8, 64);
460   const LLT V16S64 = LLT::vector(16, 64);
461 
462   std::initializer_list<LLT> AllS32Vectors =
463     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465   std::initializer_list<LLT> AllS64Vectors =
466     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467 
468   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475 
476   const LLT CodePtr = FlatPtr;
477 
478   const std::initializer_list<LLT> AddrSpaces64 = {
479     GlobalPtr, ConstantPtr, FlatPtr
480   };
481 
482   const std::initializer_list<LLT> AddrSpaces32 = {
483     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484   };
485 
486   const std::initializer_list<LLT> FPTypesBase = {
487     S32, S64
488   };
489 
490   const std::initializer_list<LLT> FPTypes16 = {
491     S32, S64, S16
492   };
493 
494   const std::initializer_list<LLT> FPTypesPK16 = {
495     S32, S64, S16, V2S16
496   };
497 
498   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499 
500   setAction({G_BRCOND, S1}, Legal); // VCC branches
501   setAction({G_BRCOND, S32}, Legal); // SCC branches
502 
503   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504   // elements for v3s16
505   getActionDefinitionsBuilder(G_PHI)
506     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507     .legalFor(AllS32Vectors)
508     .legalFor(AllS64Vectors)
509     .legalFor(AddrSpaces64)
510     .legalFor(AddrSpaces32)
511     .legalIf(isPointer(0))
512     .clampScalar(0, S16, S256)
513     .widenScalarToNextPow2(0, 32)
514     .clampMaxNumElements(0, S32, 16)
515     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516     .scalarize(0);
517 
518   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519     // Full set of gfx9 features.
520     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521       .legalFor({S32, S16, V2S16})
522       .clampScalar(0, S16, S32)
523       .clampMaxNumElements(0, S16, 2)
524       .scalarize(0)
525       .widenScalarToNextPow2(0, 32);
526 
527     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528       .legalFor({S32, S16, V2S16}) // Clamp modifier
529       .minScalarOrElt(0, S16)
530       .clampMaxNumElements(0, S16, 2)
531       .scalarize(0)
532       .widenScalarToNextPow2(0, 32)
533       .lower();
534   } else if (ST.has16BitInsts()) {
535     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536       .legalFor({S32, S16})
537       .clampScalar(0, S16, S32)
538       .scalarize(0)
539       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540 
541     // Technically the saturating operations require clamp bit support, but this
542     // was introduced at the same time as 16-bit operations.
543     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544       .legalFor({S32, S16}) // Clamp modifier
545       .minScalar(0, S16)
546       .scalarize(0)
547       .widenScalarToNextPow2(0, 16)
548       .lower();
549 
550     // We're just lowering this, but it helps get a better result to try to
551     // coerce to the desired type first.
552     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553       .minScalar(0, S16)
554       .scalarize(0)
555       .lower();
556   } else {
557     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558       .legalFor({S32})
559       .clampScalar(0, S32, S32)
560       .scalarize(0);
561 
562     if (ST.hasIntClamp()) {
563       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564         .legalFor({S32}) // Clamp modifier.
565         .scalarize(0)
566         .minScalarOrElt(0, S32)
567         .lower();
568     } else {
569       // Clamp bit support was added in VI, along with 16-bit operations.
570       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571         .minScalar(0, S32)
572         .scalarize(0)
573         .lower();
574     }
575 
576     // FIXME: DAG expansion gets better results. The widening uses the smaller
577     // range values and goes for the min/max lowering directly.
578     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579       .minScalar(0, S32)
580       .scalarize(0)
581       .lower();
582   }
583 
584   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585     .customFor({S32, S64})
586     .clampScalar(0, S32, S64)
587     .widenScalarToNextPow2(0, 32)
588     .scalarize(0);
589 
590   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591                    .legalFor({S32})
592                    .maxScalarOrElt(0, S32);
593 
594   if (ST.hasVOP3PInsts()) {
595     Mulh
596       .clampMaxNumElements(0, S8, 2)
597       .lowerFor({V2S8});
598   }
599 
600   Mulh
601     .scalarize(0)
602     .lower();
603 
604   // Report legal for any types we can handle anywhere. For the cases only legal
605   // on the SALU, RegBankSelect will be able to re-legalize.
606   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608     .clampScalar(0, S32, S64)
609     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611     .widenScalarToNextPow2(0)
612     .scalarize(0);
613 
614   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616     .legalFor({{S32, S1}, {S32, S32}})
617     .minScalar(0, S32)
618     // TODO: .scalarize(0)
619     .lower();
620 
621   getActionDefinitionsBuilder(G_BITCAST)
622     // Don't worry about the size constraint.
623     .legalIf(all(isRegisterType(0), isRegisterType(1)))
624     .lower();
625 
626 
627   getActionDefinitionsBuilder(G_CONSTANT)
628     .legalFor({S1, S32, S64, S16, GlobalPtr,
629                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630     .legalIf(isPointer(0))
631     .clampScalar(0, S32, S64)
632     .widenScalarToNextPow2(0);
633 
634   getActionDefinitionsBuilder(G_FCONSTANT)
635     .legalFor({S32, S64, S16})
636     .clampScalar(0, S16, S64);
637 
638   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639       .legalIf(isRegisterType(0))
640       // s1 and s16 are special cases because they have legal operations on
641       // them, but don't really occupy registers in the normal way.
642       .legalFor({S1, S16})
643       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644       .clampScalarOrElt(0, S32, MaxScalar)
645       .widenScalarToNextPow2(0, 32)
646       .clampMaxNumElements(0, S32, 16);
647 
648   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649 
650   // If the amount is divergent, we have to do a wave reduction to get the
651   // maximum value, so this is expanded during RegBankSelect.
652   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653     .legalFor({{PrivatePtr, S32}});
654 
655   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656     .customIf(typeIsNot(0, PrivatePtr));
657 
658   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659 
660   auto &FPOpActions = getActionDefinitionsBuilder(
661     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662     .legalFor({S32, S64});
663   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664     .customFor({S32, S64});
665   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666     .customFor({S32, S64});
667 
668   if (ST.has16BitInsts()) {
669     if (ST.hasVOP3PInsts())
670       FPOpActions.legalFor({S16, V2S16});
671     else
672       FPOpActions.legalFor({S16});
673 
674     TrigActions.customFor({S16});
675     FDIVActions.customFor({S16});
676   }
677 
678   auto &MinNumMaxNum = getActionDefinitionsBuilder({
679       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680 
681   if (ST.hasVOP3PInsts()) {
682     MinNumMaxNum.customFor(FPTypesPK16)
683       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
684       .clampMaxNumElements(0, S16, 2)
685       .clampScalar(0, S16, S64)
686       .scalarize(0);
687   } else if (ST.has16BitInsts()) {
688     MinNumMaxNum.customFor(FPTypes16)
689       .clampScalar(0, S16, S64)
690       .scalarize(0);
691   } else {
692     MinNumMaxNum.customFor(FPTypesBase)
693       .clampScalar(0, S32, S64)
694       .scalarize(0);
695   }
696 
697   if (ST.hasVOP3PInsts())
698     FPOpActions.clampMaxNumElements(0, S16, 2);
699 
700   FPOpActions
701     .scalarize(0)
702     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703 
704   TrigActions
705     .scalarize(0)
706     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707 
708   FDIVActions
709     .scalarize(0)
710     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711 
712   getActionDefinitionsBuilder({G_FNEG, G_FABS})
713     .legalFor(FPTypesPK16)
714     .clampMaxNumElements(0, S16, 2)
715     .scalarize(0)
716     .clampScalar(0, S16, S64);
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720       .legalFor({S32, S64, S16})
721       .scalarize(0)
722       .clampScalar(0, S16, S64);
723   } else {
724     getActionDefinitionsBuilder(G_FSQRT)
725       .legalFor({S32, S64})
726       .scalarize(0)
727       .clampScalar(0, S32, S64);
728 
729     if (ST.hasFractBug()) {
730       getActionDefinitionsBuilder(G_FFLOOR)
731         .customFor({S64})
732         .legalFor({S32, S64})
733         .scalarize(0)
734         .clampScalar(0, S32, S64);
735     } else {
736       getActionDefinitionsBuilder(G_FFLOOR)
737         .legalFor({S32, S64})
738         .scalarize(0)
739         .clampScalar(0, S32, S64);
740     }
741   }
742 
743   getActionDefinitionsBuilder(G_FPTRUNC)
744     .legalFor({{S32, S64}, {S16, S32}})
745     .scalarize(0)
746     .lower();
747 
748   getActionDefinitionsBuilder(G_FPEXT)
749     .legalFor({{S64, S32}, {S32, S16}})
750     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751     .scalarize(0);
752 
753   getActionDefinitionsBuilder(G_FSUB)
754       // Use actual fsub instruction
755       .legalFor({S32})
756       // Must use fadd + fneg
757       .lowerFor({S64, S16, V2S16})
758       .scalarize(0)
759       .clampScalar(0, S32, S64);
760 
761   // Whether this is legal depends on the floating point mode for the function.
762   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764     FMad.customFor({S32, S16});
765   else if (ST.hasMadMacF32Insts())
766     FMad.customFor({S32});
767   else if (ST.hasMadF16())
768     FMad.customFor({S16});
769   FMad.scalarize(0)
770       .lower();
771 
772   auto &FRem = getActionDefinitionsBuilder(G_FREM);
773   if (ST.has16BitInsts()) {
774     FRem.customFor({S16, S32, S64});
775   } else {
776     FRem.minScalar(0, S32)
777         .customFor({S32, S64});
778   }
779   FRem.scalarize(0);
780 
781   // TODO: Do we need to clamp maximum bitwidth?
782   getActionDefinitionsBuilder(G_TRUNC)
783     .legalIf(isScalar(0))
784     .legalFor({{V2S16, V2S32}})
785     .clampMaxNumElements(0, S16, 2)
786     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787     // situations (like an invalid implicit use), we don't want to infinite loop
788     // in the legalizer.
789     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
790     .alwaysLegal();
791 
792   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794                {S32, S1}, {S64, S1}, {S16, S1}})
795     .scalarize(0)
796     .clampScalar(0, S32, S64)
797     .widenScalarToNextPow2(1, 32);
798 
799   // TODO: Split s1->s64 during regbankselect for VALU.
800   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802     .lowerFor({{S32, S64}})
803     .lowerIf(typeIs(1, S1))
804     .customFor({{S64, S64}});
805   if (ST.has16BitInsts())
806     IToFP.legalFor({{S16, S16}});
807   IToFP.clampScalar(1, S32, S64)
808        .minScalar(0, S32)
809        .scalarize(0)
810        .widenScalarToNextPow2(1);
811 
812   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814     .customFor({{S64, S64}})
815     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816   if (ST.has16BitInsts())
817     FPToI.legalFor({{S16, S16}});
818   else
819     FPToI.minScalar(1, S32);
820 
821   FPToI.minScalar(0, S32)
822        .widenScalarToNextPow2(0, 32)
823        .scalarize(0)
824        .lower();
825 
826   // Lower roundeven into G_FRINT
827   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
828     .scalarize(0)
829     .lower();
830 
831   if (ST.has16BitInsts()) {
832     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
833       .legalFor({S16, S32, S64})
834       .clampScalar(0, S16, S64)
835       .scalarize(0);
836   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
837     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
838       .legalFor({S32, S64})
839       .clampScalar(0, S32, S64)
840       .scalarize(0);
841   } else {
842     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
843       .legalFor({S32})
844       .customFor({S64})
845       .clampScalar(0, S32, S64)
846       .scalarize(0);
847   }
848 
849   getActionDefinitionsBuilder(G_PTR_ADD)
850     .legalIf(all(isPointer(0), sameSize(0, 1)))
851     .scalarize(0)
852     .scalarSameSizeAs(1, 0);
853 
854   getActionDefinitionsBuilder(G_PTRMASK)
855     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
856     .scalarSameSizeAs(1, 0)
857     .scalarize(0);
858 
859   auto &CmpBuilder =
860     getActionDefinitionsBuilder(G_ICMP)
861     // The compare output type differs based on the register bank of the output,
862     // so make both s1 and s32 legal.
863     //
864     // Scalar compares producing output in scc will be promoted to s32, as that
865     // is the allocatable register type that will be needed for the copy from
866     // scc. This will be promoted during RegBankSelect, and we assume something
867     // before that won't try to use s32 result types.
868     //
869     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
870     // bank.
871     .legalForCartesianProduct(
872       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
873     .legalForCartesianProduct(
874       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
875   if (ST.has16BitInsts()) {
876     CmpBuilder.legalFor({{S1, S16}});
877   }
878 
879   CmpBuilder
880     .widenScalarToNextPow2(1)
881     .clampScalar(1, S32, S64)
882     .scalarize(0)
883     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
884 
885   getActionDefinitionsBuilder(G_FCMP)
886     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
887     .widenScalarToNextPow2(1)
888     .clampScalar(1, S32, S64)
889     .scalarize(0);
890 
891   // FIXME: fpow has a selection pattern that should move to custom lowering.
892   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
893   if (ST.has16BitInsts())
894     Exp2Ops.legalFor({S32, S16});
895   else
896     Exp2Ops.legalFor({S32});
897   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
898   Exp2Ops.scalarize(0);
899 
900   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
901   if (ST.has16BitInsts())
902     ExpOps.customFor({{S32}, {S16}});
903   else
904     ExpOps.customFor({S32});
905   ExpOps.clampScalar(0, MinScalarFPTy, S32)
906         .scalarize(0);
907 
908   getActionDefinitionsBuilder(G_FPOWI)
909     .clampScalar(0, MinScalarFPTy, S32)
910     .lower();
911 
912   // The 64-bit versions produce 32-bit results, but only on the SALU.
913   getActionDefinitionsBuilder(G_CTPOP)
914     .legalFor({{S32, S32}, {S32, S64}})
915     .clampScalar(0, S32, S32)
916     .clampScalar(1, S32, S64)
917     .scalarize(0)
918     .widenScalarToNextPow2(0, 32)
919     .widenScalarToNextPow2(1, 32);
920 
921   // The hardware instructions return a different result on 0 than the generic
922   // instructions expect. The hardware produces -1, but these produce the
923   // bitwidth.
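  // e.g. G_CTLZ of an s32 zero is defined to produce 32, not -1.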
924   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
925     .scalarize(0)
926     .clampScalar(0, S32, S32)
927     .clampScalar(1, S32, S64)
928     .widenScalarToNextPow2(0, 32)
929     .widenScalarToNextPow2(1, 32)
930     .lower();
931 
932   // The 64-bit versions produce 32-bit results, but only on the SALU.
933   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
934     .legalFor({{S32, S32}, {S32, S64}})
935     .clampScalar(0, S32, S32)
936     .clampScalar(1, S32, S64)
937     .scalarize(0)
938     .widenScalarToNextPow2(0, 32)
939     .widenScalarToNextPow2(1, 32);
940 
941   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
942   // RegBankSelect.
943   getActionDefinitionsBuilder(G_BITREVERSE)
944     .legalFor({S32, S64})
945     .clampScalar(0, S32, S64)
946     .scalarize(0)
947     .widenScalarToNextPow2(0);
948 
949   if (ST.has16BitInsts()) {
950     getActionDefinitionsBuilder(G_BSWAP)
951       .legalFor({S16, S32, V2S16})
952       .clampMaxNumElements(0, S16, 2)
953       // FIXME: Fixing non-power-of-2 before clamp is workaround for
954       // narrowScalar limitation.
955       .widenScalarToNextPow2(0)
956       .clampScalar(0, S16, S32)
957       .scalarize(0);
958 
959     if (ST.hasVOP3PInsts()) {
960       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
961         .legalFor({S32, S16, V2S16})
962         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
963         .clampMaxNumElements(0, S16, 2)
964         .minScalar(0, S16)
965         .widenScalarToNextPow2(0)
966         .scalarize(0)
967         .lower();
968     } else {
969       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970         .legalFor({S32, S16})
971         .widenScalarToNextPow2(0)
972         .minScalar(0, S16)
973         .scalarize(0)
974         .lower();
975     }
976   } else {
977     // TODO: Should have same legality without v_perm_b32
978     getActionDefinitionsBuilder(G_BSWAP)
979       .legalFor({S32})
980       .lowerIf(scalarNarrowerThan(0, 32))
981       // FIXME: Fixing non-power-of-2 before clamp is workaround for
982       // narrowScalar limitation.
983       .widenScalarToNextPow2(0)
984       .maxScalar(0, S32)
985       .scalarize(0)
986       .lower();
987 
988     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
989       .legalFor({S32})
990       .minScalar(0, S32)
991       .widenScalarToNextPow2(0)
992       .scalarize(0)
993       .lower();
994   }
995 
996   getActionDefinitionsBuilder(G_INTTOPTR)
997     // List the common cases
998     .legalForCartesianProduct(AddrSpaces64, {S64})
999     .legalForCartesianProduct(AddrSpaces32, {S32})
1000     .scalarize(0)
1001     // Accept any address space as long as the size matches
1002     .legalIf(sameSize(0, 1))
1003     .widenScalarIf(smallerThan(1, 0),
1004       [](const LegalityQuery &Query) {
1005         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1006       })
1007     .narrowScalarIf(largerThan(1, 0),
1008       [](const LegalityQuery &Query) {
1009         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1010       });
1011 
1012   getActionDefinitionsBuilder(G_PTRTOINT)
1013     // List the common cases
1014     .legalForCartesianProduct(AddrSpaces64, {S64})
1015     .legalForCartesianProduct(AddrSpaces32, {S32})
1016     .scalarize(0)
1017     // Accept any address space as long as the size matches
1018     .legalIf(sameSize(0, 1))
1019     .widenScalarIf(smallerThan(0, 1),
1020       [](const LegalityQuery &Query) {
1021         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1022       })
1023     .narrowScalarIf(
1024       largerThan(0, 1),
1025       [](const LegalityQuery &Query) {
1026         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1027       });
1028 
1029   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1030     .scalarize(0)
1031     .custom();
1032 
1033   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1034                                     bool IsLoad) -> bool {
1035     const LLT DstTy = Query.Types[0];
1036 
1037     // Split vector extloads.
1038     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1039     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1040 
1041     if (MemSize < DstTy.getSizeInBits())
1042       MemSize = std::max(MemSize, AlignBits);
1043 
1044     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1045       return true;
1046 
1047     const LLT PtrTy = Query.Types[1];
1048     unsigned AS = PtrTy.getAddressSpace();
1049     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1050       return true;
1051 
1052     // Catch weird sized loads that don't evenly divide into the access sizes
1053     // TODO: May be able to widen depending on alignment etc.
1054     unsigned NumRegs = (MemSize + 31) / 32;
1055     if (NumRegs == 3) {
1056       if (!ST.hasDwordx3LoadStores())
1057         return true;
1058     } else {
1059       // If the alignment allows, these should have been widened.
1060       if (!isPowerOf2_32(NumRegs))
1061         return true;
1062     }
1063 
1064     if (AlignBits < MemSize) {
1065       const SITargetLowering *TLI = ST.getTargetLowering();
1066       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1067                                                       Align(AlignBits / 8));
1068     }
1069 
1070     return false;
1071   };
1072 
1073   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1074   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1075   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1076 
1077   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1078   // LDS
1079   // TODO: Unsupported flat for SI.
1080 
1081   for (unsigned Op : {G_LOAD, G_STORE}) {
1082     const bool IsStore = Op == G_STORE;
1083 
1084     auto &Actions = getActionDefinitionsBuilder(Op);
1085     // Explicitly list some common cases.
1086     // TODO: Does this help compile time at all?
1087     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1088                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1089                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1090                                       {S64, GlobalPtr, 64, GlobalAlign32},
1091                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1092                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1093                                       {S32, GlobalPtr, 8, GlobalAlign8},
1094                                       {S32, GlobalPtr, 16, GlobalAlign16},
1095 
1096                                       {S32, LocalPtr, 32, 32},
1097                                       {S64, LocalPtr, 64, 32},
1098                                       {V2S32, LocalPtr, 64, 32},
1099                                       {S32, LocalPtr, 8, 8},
1100                                       {S32, LocalPtr, 16, 16},
1101                                       {V2S16, LocalPtr, 32, 32},
1102 
1103                                       {S32, PrivatePtr, 32, 32},
1104                                       {S32, PrivatePtr, 8, 8},
1105                                       {S32, PrivatePtr, 16, 16},
1106                                       {V2S16, PrivatePtr, 32, 32},
1107 
1108                                       {S32, ConstantPtr, 32, GlobalAlign32},
1109                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1110                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1111                                       {S64, ConstantPtr, 64, GlobalAlign32},
1112                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1113     Actions.legalIf(
1114       [=](const LegalityQuery &Query) -> bool {
1115         return isLoadStoreLegal(ST, Query, Op);
1116       });
1117 
1118     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1119     // 64-bits.
1120     //
1121     // TODO: Should generalize bitcast action into coerce, which will also cover
1122     // inserting addrspacecasts.
1123     Actions.customIf(typeIs(1, Constant32Ptr));
1124 
1125     // Turn any illegal element vectors into something easier to deal
1126     // with. These will ultimately produce 32-bit scalar shifts to extract the
1127     // parts anyway.
1128     //
1129     // For odd 16-bit element vectors, prefer to split those into pieces with
1130     // 16-bit vector parts.
1131     Actions.bitcastIf(
1132       [=](const LegalityQuery &Query) -> bool {
1133         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1134                                           Query.MMODescrs[0].SizeInBits);
1135       }, bitcastToRegisterType(0));
1136 
1137     if (!IsStore) {
1138       // Widen suitably aligned loads by loading extra bytes. The standard
1139       // legalization actions can't properly express widening memory operands.
1140       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1141         return shouldWidenLoad(ST, Query, G_LOAD);
1142       });
1143     }
1144 
1145     // FIXME: load/store narrowing should be moved to lower action
1146     Actions
1147         .narrowScalarIf(
1148             [=](const LegalityQuery &Query) -> bool {
1149               return !Query.Types[0].isVector() &&
1150                      needToSplitMemOp(Query, Op == G_LOAD);
1151             },
1152             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1153               const LLT DstTy = Query.Types[0];
1154               const LLT PtrTy = Query.Types[1];
1155 
1156               const unsigned DstSize = DstTy.getSizeInBits();
1157               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1158 
1159               // Split extloads.
1160               if (DstSize > MemSize)
1161                 return std::make_pair(0, LLT::scalar(MemSize));
1162 
1163               if (!isPowerOf2_32(DstSize)) {
1164                 // We're probably decomposing an odd sized store. Try to split
1165                 // to the widest type. TODO: Account for alignment. As-is it
1166                 // should be OK, since the new parts will be further legalized.
1167                 unsigned FloorSize = PowerOf2Floor(DstSize);
1168                 return std::make_pair(0, LLT::scalar(FloorSize));
1169               }
1170 
1171               if (DstSize > 32 && (DstSize % 32 != 0)) {
1172                 // FIXME: Need a way to specify non-extload of larger size if
1173                 // suitably aligned.
1174                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1175               }
1176 
1177               unsigned MaxSize = maxSizeForAddrSpace(ST,
1178                                                      PtrTy.getAddressSpace(),
1179                                                      Op == G_LOAD);
1180               if (MemSize > MaxSize)
1181                 return std::make_pair(0, LLT::scalar(MaxSize));
1182 
1183               unsigned Align = Query.MMODescrs[0].AlignInBits;
1184               return std::make_pair(0, LLT::scalar(Align));
1185             })
1186         .fewerElementsIf(
1187             [=](const LegalityQuery &Query) -> bool {
1188               return Query.Types[0].isVector() &&
1189                      needToSplitMemOp(Query, Op == G_LOAD);
1190             },
1191             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1192               const LLT DstTy = Query.Types[0];
1193               const LLT PtrTy = Query.Types[1];
1194 
1195               LLT EltTy = DstTy.getElementType();
1196               unsigned MaxSize = maxSizeForAddrSpace(ST,
1197                                                      PtrTy.getAddressSpace(),
1198                                                      Op == G_LOAD);
1199 
1200               // FIXME: Handle widened to power of 2 results better. This ends
1201               // up scalarizing.
1202               // FIXME: 3 element stores scalarized on SI
1203 
1204               // Split if it's too large for the address space.
1205               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1206                 unsigned NumElts = DstTy.getNumElements();
1207                 unsigned EltSize = EltTy.getSizeInBits();
1208 
1209                 if (MaxSize % EltSize == 0) {
1210                   return std::make_pair(
1211                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1212                 }
1213 
1214                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1215 
1216                 // FIXME: Refine when odd breakdowns handled
1217                 // The scalars will need to be re-legalized.
1218                 if (NumPieces == 1 || NumPieces >= NumElts ||
1219                     NumElts % NumPieces != 0)
1220                   return std::make_pair(0, EltTy);
1221 
1222                 return std::make_pair(0,
1223                                       LLT::vector(NumElts / NumPieces, EltTy));
1224               }
1225 
1226               // FIXME: We could probably handle weird extending loads better.
1227               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1228               if (DstTy.getSizeInBits() > MemSize)
1229                 return std::make_pair(0, EltTy);
1230 
1231               unsigned EltSize = EltTy.getSizeInBits();
1232               unsigned DstSize = DstTy.getSizeInBits();
1233               if (!isPowerOf2_32(DstSize)) {
1234                 // We're probably decomposing an odd sized store. Try to split
1235                 // to the widest type. TODO: Account for alignment. As-is it
1236                 // should be OK, since the new parts will be further legalized.
1237                 unsigned FloorSize = PowerOf2Floor(DstSize);
1238                 return std::make_pair(
1239                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1240               }
1241 
1242               // Need to split because of alignment.
1243               unsigned Align = Query.MMODescrs[0].AlignInBits;
1244               if (EltSize > Align &&
1245                   (EltSize / Align < DstTy.getNumElements())) {
1246                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1247               }
1248 
1249               // May need relegalization for the scalars.
1250               return std::make_pair(0, EltTy);
1251             })
1252     .lowerIfMemSizeNotPow2()
1253     .minScalar(0, S32);
1254 
1255     if (IsStore)
1256       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1257 
1258     Actions
1259         .widenScalarToNextPow2(0)
1260         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1261         .lower();
1262   }
1263 
1264   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1265                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1266                                                   {S32, GlobalPtr, 16, 2 * 8},
1267                                                   {S32, LocalPtr, 8, 8},
1268                                                   {S32, LocalPtr, 16, 16},
1269                                                   {S32, PrivatePtr, 8, 8},
1270                                                   {S32, PrivatePtr, 16, 16},
1271                                                   {S32, ConstantPtr, 8, 8},
1272                                                   {S32, ConstantPtr, 16, 2 * 8}});
1273   if (ST.hasFlatAddressSpace()) {
1274     ExtLoads.legalForTypesWithMemDesc(
1275         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1276   }
1277 
1278   ExtLoads.clampScalar(0, S32, S32)
1279           .widenScalarToNextPow2(0)
1280           .unsupportedIfMemSizeNotPow2()
1281           .lower();
1282 
1283   auto &Atomics = getActionDefinitionsBuilder(
1284     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287      G_ATOMICRMW_UMIN})
1288     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289                {S64, GlobalPtr}, {S64, LocalPtr},
1290                {S32, RegionPtr}, {S64, RegionPtr}});
1291   if (ST.hasFlatAddressSpace()) {
1292     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293   }
1294 
1295   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296   if (ST.hasLDSFPAtomics()) {
1297     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298     if (ST.hasGFX90AInsts())
1299       Atomic.legalFor({{S64, LocalPtr}});
1300   }
1301   if (ST.hasAtomicFaddInsts())
1302     Atomic.legalFor({{S32, GlobalPtr}});
1303 
1304   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1305   // demarshalling
1306   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1307     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1308                 {S32, FlatPtr}, {S64, FlatPtr}})
1309     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1310                {S32, RegionPtr}, {S64, RegionPtr}});
1311   // TODO: Pointer types, any 32-bit or 64-bit vector
1312 
1313   // Condition should be s32 for scalar, s1 for vector.
1314   getActionDefinitionsBuilder(G_SELECT)
1315     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1316           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1317           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1318     .clampScalar(0, S16, S64)
1319     .scalarize(1)
1320     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1321     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1322     .clampMaxNumElements(0, S32, 2)
1323     .clampMaxNumElements(0, LocalPtr, 2)
1324     .clampMaxNumElements(0, PrivatePtr, 2)
1325     .scalarize(0)
1326     .widenScalarToNextPow2(0)
1327     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1328 
1329   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1330   // be more flexible with the shift amount type.
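  // (4 bits for 16-bit shifts, 5 for 32-bit, and 6 for 64-bit.)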
1331   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1332     .legalFor({{S32, S32}, {S64, S32}});
1333   if (ST.has16BitInsts()) {
1334     if (ST.hasVOP3PInsts()) {
1335       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1336             .clampMaxNumElements(0, S16, 2);
1337     } else
1338       Shifts.legalFor({{S16, S16}});
1339 
1340     // TODO: Support 16-bit shift amounts for all types
1341     Shifts.widenScalarIf(
1342       [=](const LegalityQuery &Query) {
1343         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1344         // 32-bit amount.
1345         const LLT ValTy = Query.Types[0];
1346         const LLT AmountTy = Query.Types[1];
1347         return ValTy.getSizeInBits() <= 16 &&
1348                AmountTy.getSizeInBits() < 16;
1349       }, changeTo(1, S16));
1350     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1351     Shifts.clampScalar(1, S32, S32);
1352     Shifts.clampScalar(0, S16, S64);
1353     Shifts.widenScalarToNextPow2(0, 16);
1354 
1355     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1356       .minScalar(0, S16)
1357       .scalarize(0)
1358       .lower();
1359   } else {
1360     // Make sure we legalize the shift amount type first, as the general
1361     // expansion for the shifted type will produce much worse code if it hasn't
1362     // been truncated already.
1363     Shifts.clampScalar(1, S32, S32);
1364     Shifts.clampScalar(0, S32, S64);
1365     Shifts.widenScalarToNextPow2(0, 32);
1366 
1367     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1368       .minScalar(0, S32)
1369       .scalarize(0)
1370       .lower();
1371   }
1372   Shifts.scalarize(0);
1373 
1374   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1375     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1376     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1377     unsigned IdxTypeIdx = 2;
1378 
1379     getActionDefinitionsBuilder(Op)
1380       .customIf([=](const LegalityQuery &Query) {
1381           const LLT EltTy = Query.Types[EltTypeIdx];
1382           const LLT VecTy = Query.Types[VecTypeIdx];
1383           const LLT IdxTy = Query.Types[IdxTypeIdx];
1384           const unsigned EltSize = EltTy.getSizeInBits();
1385           return (EltSize == 32 || EltSize == 64) &&
1386                   VecTy.getSizeInBits() % 32 == 0 &&
1387                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1388                   IdxTy.getSizeInBits() == 32;
1389         })
1390       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1391                  bitcastToVectorElement32(VecTypeIdx))
1392       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1393       .bitcastIf(
1394         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1395         [=](const LegalityQuery &Query) {
1396           // For > 64-bit element types, try to turn this into a 64-bit
1397           // element vector since we may be able to do better indexing
1398           // if this is scalar. If not, fall back to 32.
1399           const LLT EltTy = Query.Types[EltTypeIdx];
1400           const LLT VecTy = Query.Types[VecTypeIdx];
1401           const unsigned DstEltSize = EltTy.getSizeInBits();
1402           const unsigned VecSize = VecTy.getSizeInBits();
1403 
1404           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1405           return std::make_pair(
1406             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1407         })
1408       .clampScalar(EltTypeIdx, S32, S64)
1409       .clampScalar(VecTypeIdx, S32, S64)
1410       .clampScalar(IdxTypeIdx, S32, S32)
1411       .clampMaxNumElements(VecTypeIdx, S32, 32)
1412       // TODO: Clamp elements for 64-bit vectors?
1413       // It should only be necessary with variable indexes.
1414       // As a last resort, lower to the stack
1415       .lower();
1416   }
1417 
1418   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1419     .unsupportedIf([=](const LegalityQuery &Query) {
1420         const LLT &EltTy = Query.Types[1].getElementType();
1421         return Query.Types[0] != EltTy;
1422       });
1423 
1424   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1425     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1426     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1427 
1428     // FIXME: Doesn't handle extract of illegal sizes.
1429     getActionDefinitionsBuilder(Op)
1430       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1431       // FIXME: Multiples of 16 should not be legal.
1432       .legalIf([=](const LegalityQuery &Query) {
1433           const LLT BigTy = Query.Types[BigTyIdx];
1434           const LLT LitTy = Query.Types[LitTyIdx];
1435           return (BigTy.getSizeInBits() % 32 == 0) &&
1436                  (LitTy.getSizeInBits() % 16 == 0);
1437         })
1438       .widenScalarIf(
1439         [=](const LegalityQuery &Query) {
1440           const LLT BigTy = Query.Types[BigTyIdx];
1441           return (BigTy.getScalarSizeInBits() < 16);
1442         },
1443         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1444       .widenScalarIf(
1445         [=](const LegalityQuery &Query) {
1446           const LLT LitTy = Query.Types[LitTyIdx];
1447           return (LitTy.getScalarSizeInBits() < 16);
1448         },
1449         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1450       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1451       .widenScalarToNextPow2(BigTyIdx, 32);
1452 
1453   }
1454 
1455   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1456     .legalForCartesianProduct(AllS32Vectors, {S32})
1457     .legalForCartesianProduct(AllS64Vectors, {S64})
1458     .clampNumElements(0, V16S32, V32S32)
1459     .clampNumElements(0, V2S64, V16S64)
1460     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1461 
1462   if (ST.hasScalarPackInsts()) {
1463     BuildVector
1464       // FIXME: Should probably widen s1 vectors straight to s32
1465       .minScalarOrElt(0, S16)
1466       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1467       .minScalar(1, S32);
1468 
1469     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1470       .legalFor({V2S16, S32})
1471       .lower();
1472     BuildVector.minScalarOrElt(0, S32);
1473   } else {
1474     BuildVector.customFor({V2S16, S16});
1475     BuildVector.minScalarOrElt(0, S32);
1476 
1477     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1478       .customFor({V2S16, S32})
1479       .lower();
1480   }
1481 
1482   BuildVector.legalIf(isRegisterType(0));
1483 
1484   // FIXME: Clamp maximum size
1485   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1486     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1487     .clampMaxNumElements(0, S32, 32)
1488     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1489     .clampMaxNumElements(0, S16, 64);
1490 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1493   if (ST.hasVOP3PInsts()) {
1494     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1495       .customFor({V2S16, V2S16})
1496       .lower();
1497   } else
1498     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1499 
1500   // Merge/Unmerge
1501   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1502     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1503     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1504 
1505     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1506       const LLT Ty = Query.Types[TypeIdx];
1507       if (Ty.isVector()) {
1508         const LLT &EltTy = Ty.getElementType();
1509         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1510           return true;
1511         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1512           return true;
1513       }
1514       return false;
1515     };
1516 
1517     auto &Builder = getActionDefinitionsBuilder(Op)
1518       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1519       .lowerFor({{S16, V2S16}})
1520       .lowerIf([=](const LegalityQuery &Query) {
1521           const LLT BigTy = Query.Types[BigTyIdx];
1522           return BigTy.getSizeInBits() == 32;
1523         })
1524       // Try to widen to s16 first for small types.
1525       // TODO: Only do this on targets with legal s16 shifts
1526       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1527       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1528       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1529       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1530                            elementTypeIs(1, S16)),
1531                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1535       .clampScalar(LitTyIdx, S32, S512)
1536       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1537       // Break up vectors with weird elements into scalars
1538       .fewerElementsIf(
1539         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1540         scalarize(0))
1541       .fewerElementsIf(
1542         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1543         scalarize(1))
1544       .clampScalar(BigTyIdx, S32, MaxScalar);
1545 
1546     if (Op == G_MERGE_VALUES) {
1547       Builder.widenScalarIf(
1548         // TODO: Use 16-bit shifts if legal for 8-bit values?
1549         [=](const LegalityQuery &Query) {
1550           const LLT Ty = Query.Types[LitTyIdx];
1551           return Ty.getSizeInBits() < 32;
1552         },
1553         changeTo(LitTyIdx, S32));
1554     }
1555 
1556     Builder.widenScalarIf(
1557       [=](const LegalityQuery &Query) {
1558         const LLT Ty = Query.Types[BigTyIdx];
1559         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1560           Ty.getSizeInBits() % 16 != 0;
1561       },
1562       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
1565         const LLT &Ty = Query.Types[BigTyIdx];
1566         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1567         if (NewSizeInBits >= 256) {
1568           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1569           if (RoundedTo < NewSizeInBits)
1570             NewSizeInBits = RoundedTo;
1571         }
1572         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1573       })
1574       // Any vectors left are the wrong size. Scalarize them.
1575       .scalarize(0)
1576       .scalarize(1);
1577   }
1578 
1579   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1580   // RegBankSelect.
1581   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1582     .legalFor({{S32}, {S64}});
1583 
1584   if (ST.hasVOP3PInsts()) {
1585     SextInReg.lowerFor({{V2S16}})
1586       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1587       // get more vector shift opportunities, since we'll get those when
1588       // expanded.
1589       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1590   } else if (ST.has16BitInsts()) {
1591     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1592   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1595     SextInReg.lowerFor({{S32}, {S64}});
1596   }
1597 
1598   SextInReg
1599     .scalarize(0)
1600     .clampScalar(0, S32, S64)
1601     .lower();
1602 
  // TODO: Only try to form v2s16 with legal packed instructions.
1604   getActionDefinitionsBuilder(G_FSHR)
1605     .legalFor({{S32, S32}})
1606     .lowerFor({{V2S16, V2S16}})
1607     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1608     .scalarize(0)
1609     .lower();
1610 
1611   if (ST.hasVOP3PInsts()) {
1612     getActionDefinitionsBuilder(G_FSHL)
1613       .lowerFor({{V2S16, V2S16}})
1614       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1615       .scalarize(0)
1616       .lower();
1617   } else {
1618     getActionDefinitionsBuilder(G_FSHL)
1619       .scalarize(0)
1620       .lower();
1621   }
1622 
1623   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1624     .legalFor({S64});
1625 
1626   getActionDefinitionsBuilder(G_FENCE)
1627     .alwaysLegal();
1628 
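  // Overflow-checking multiplies are scalarized, widened to at least 32 bits,
  // and then expanded.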
1629   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1630       .scalarize(0)
1631       .minScalar(0, S32)
1632       .lower();
1633 
1634   getActionDefinitionsBuilder({
1635       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1636       G_FCOPYSIGN,
1637 
1638       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1639       G_ATOMICRMW_NAND,
1640       G_ATOMICRMW_FSUB,
1641       G_READ_REGISTER,
1642       G_WRITE_REGISTER,
1643 
1644       G_SADDO, G_SSUBO,
1645 
      // TODO: Implement
1647       G_FMINIMUM, G_FMAXIMUM}).lower();
1648 
1649   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1650         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1651         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1652     .unsupported();
1653 
1654   computeTables();
1655   verify(*ST.getInstrInfo());
1656 }
1657 
1658 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1659                                          MachineInstr &MI) const {
1660   MachineIRBuilder &B = Helper.MIRBuilder;
1661   MachineRegisterInfo &MRI = *B.getMRI();
1662 
1663   switch (MI.getOpcode()) {
1664   case TargetOpcode::G_ADDRSPACE_CAST:
1665     return legalizeAddrSpaceCast(MI, MRI, B);
1666   case TargetOpcode::G_FRINT:
1667     return legalizeFrint(MI, MRI, B);
1668   case TargetOpcode::G_FCEIL:
1669     return legalizeFceil(MI, MRI, B);
1670   case TargetOpcode::G_FREM:
1671     return legalizeFrem(MI, MRI, B);
1672   case TargetOpcode::G_INTRINSIC_TRUNC:
1673     return legalizeIntrinsicTrunc(MI, MRI, B);
1674   case TargetOpcode::G_SITOFP:
1675     return legalizeITOFP(MI, MRI, B, true);
1676   case TargetOpcode::G_UITOFP:
1677     return legalizeITOFP(MI, MRI, B, false);
1678   case TargetOpcode::G_FPTOSI:
1679     return legalizeFPTOI(MI, MRI, B, true);
1680   case TargetOpcode::G_FPTOUI:
1681     return legalizeFPTOI(MI, MRI, B, false);
1682   case TargetOpcode::G_FMINNUM:
1683   case TargetOpcode::G_FMAXNUM:
1684   case TargetOpcode::G_FMINNUM_IEEE:
1685   case TargetOpcode::G_FMAXNUM_IEEE:
1686     return legalizeMinNumMaxNum(Helper, MI);
1687   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1688     return legalizeExtractVectorElt(MI, MRI, B);
1689   case TargetOpcode::G_INSERT_VECTOR_ELT:
1690     return legalizeInsertVectorElt(MI, MRI, B);
1691   case TargetOpcode::G_SHUFFLE_VECTOR:
1692     return legalizeShuffleVector(MI, MRI, B);
1693   case TargetOpcode::G_FSIN:
1694   case TargetOpcode::G_FCOS:
1695     return legalizeSinCos(MI, MRI, B);
1696   case TargetOpcode::G_GLOBAL_VALUE:
1697     return legalizeGlobalValue(MI, MRI, B);
1698   case TargetOpcode::G_LOAD:
1699     return legalizeLoad(Helper, MI);
1700   case TargetOpcode::G_FMAD:
1701     return legalizeFMad(MI, MRI, B);
1702   case TargetOpcode::G_FDIV:
1703     return legalizeFDIV(MI, MRI, B);
1704   case TargetOpcode::G_UDIV:
1705   case TargetOpcode::G_UREM:
1706     return legalizeUDIV_UREM(MI, MRI, B);
1707   case TargetOpcode::G_SDIV:
1708   case TargetOpcode::G_SREM:
1709     return legalizeSDIV_SREM(MI, MRI, B);
1710   case TargetOpcode::G_ATOMIC_CMPXCHG:
1711     return legalizeAtomicCmpXChg(MI, MRI, B);
1712   case TargetOpcode::G_FLOG:
1713     return legalizeFlog(MI, B, numbers::ln2f);
1714   case TargetOpcode::G_FLOG10:
1715     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1716   case TargetOpcode::G_FEXP:
1717     return legalizeFExp(MI, B);
1718   case TargetOpcode::G_FPOW:
1719     return legalizeFPow(MI, B);
1720   case TargetOpcode::G_FFLOOR:
1721     return legalizeFFloor(MI, MRI, B);
1722   case TargetOpcode::G_BUILD_VECTOR:
1723     return legalizeBuildVector(MI, MRI, B);
1724   default:
1725     return false;
1726   }
1727 
1728   llvm_unreachable("expected switch to return");
1729 }
1730 
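// Return a 32-bit register holding the high half of the aperture base for
// the given LDS or private address space, read with s_getreg on subtargets
// with aperture registers or otherwise loaded from the queue pointer.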
1731 Register AMDGPULegalizerInfo::getSegmentAperture(
1732   unsigned AS,
1733   MachineRegisterInfo &MRI,
1734   MachineIRBuilder &B) const {
1735   MachineFunction &MF = B.getMF();
1736   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1737   const LLT S32 = LLT::scalar(32);
1738 
1739   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1740 
1741   if (ST.hasApertureRegs()) {
1742     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1743     // getreg.
1744     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1745         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1746         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1747     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1748         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1749         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1750     unsigned Encoding =
1751         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1752         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1753         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1754 
1755     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1756 
1757     B.buildInstr(AMDGPU::S_GETREG_B32)
1758       .addDef(GetReg)
1759       .addImm(Encoding);
1760     MRI.setType(GetReg, S32);
1761 
1762     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1763     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1764   }
1765 
1766   Register QueuePtr = MRI.createGenericVirtualRegister(
1767     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1768 
1769   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1770     return Register();
1771 
1772   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1773   // private_segment_aperture_base_hi.
1774   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1775 
1776   // TODO: can we be smarter about machine pointer info?
1777   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1778   MachineMemOperand *MMO = MF.getMachineMemOperand(
1779       PtrInfo,
1780       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1781           MachineMemOperand::MOInvariant,
1782       4, commonAlignment(Align(64), StructOffset));
1783 
1784   Register LoadAddr;
1785 
1786   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1787   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1788 }
1789 
1790 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1791   MachineInstr &MI, MachineRegisterInfo &MRI,
1792   MachineIRBuilder &B) const {
1793   MachineFunction &MF = B.getMF();
1794 
1795   const LLT S32 = LLT::scalar(32);
1796   Register Dst = MI.getOperand(0).getReg();
1797   Register Src = MI.getOperand(1).getReg();
1798 
1799   LLT DstTy = MRI.getType(Dst);
1800   LLT SrcTy = MRI.getType(Src);
1801   unsigned DestAS = DstTy.getAddressSpace();
1802   unsigned SrcAS = SrcTy.getAddressSpace();
1803 
1804   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1805   // vector element.
1806   assert(!DstTy.isVector());
1807 
1808   const AMDGPUTargetMachine &TM
1809     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1810 
1811   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1812     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1813     return true;
1814   }
1815 
1816   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1817     // Truncate.
1818     B.buildExtract(Dst, Src, 0);
1819     MI.eraseFromParent();
1820     return true;
1821   }
1822 
1823   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1824     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1825     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1826 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // produce another pointer. Merge operands are required to be the same
    // type, but creating an extra ptrtoint would be kind of pointless.
1830     auto HighAddr = B.buildConstant(
1831       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1832     B.buildMerge(Dst, {Src, HighAddr});
1833     MI.eraseFromParent();
1834     return true;
1835   }
1836 
1837   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1838     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1839            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1840     unsigned NullVal = TM.getNullPointerValue(DestAS);
1841 
1842     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1843     auto FlatNull = B.buildConstant(SrcTy, 0);
1844 
1845     // Extract low 32-bits of the pointer.
1846     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1847 
1848     auto CmpRes =
1849         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1850     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1851 
1852     MI.eraseFromParent();
1853     return true;
1854   }
1855 
1856   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1857     return false;
1858 
1859   if (!ST.hasFlatAddressSpace())
1860     return false;
1861 
1862   auto SegmentNull =
1863       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1864   auto FlatNull =
1865       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1866 
1867   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1868   if (!ApertureReg.isValid())
1869     return false;
1870 
1871   auto CmpRes =
1872       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1873 
1874   // Coerce the type of the low half of the result so we can use merge_values.
1875   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1876 
1877   // TODO: Should we allow mismatched types but matching sizes in merges to
1878   // avoid the ptrtoint?
1879   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1880   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1881 
1882   MI.eraseFromParent();
1883   return true;
1884 }
1885 
1886 bool AMDGPULegalizerInfo::legalizeFrint(
1887   MachineInstr &MI, MachineRegisterInfo &MRI,
1888   MachineIRBuilder &B) const {
1889   Register Src = MI.getOperand(1).getReg();
1890   LLT Ty = MRI.getType(Src);
1891   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1892 
1893   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1894   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1895 
1896   auto C1 = B.buildFConstant(Ty, C1Val);
1897   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1898 
1899   // TODO: Should this propagate fast-math-flags?
1900   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1901   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1902 
1903   auto C2 = B.buildFConstant(Ty, C2Val);
1904   auto Fabs = B.buildFAbs(Ty, Src);
1905 
1906   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1907   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1908   MI.eraseFromParent();
1909   return true;
1910 }
1911 
1912 bool AMDGPULegalizerInfo::legalizeFceil(
1913   MachineInstr &MI, MachineRegisterInfo &MRI,
1914   MachineIRBuilder &B) const {
1915 
1916   const LLT S1 = LLT::scalar(1);
1917   const LLT S64 = LLT::scalar(64);
1918 
1919   Register Src = MI.getOperand(1).getReg();
1920   assert(MRI.getType(Src) == S64);
1921 
1922   // result = trunc(src)
1923   // if (src > 0.0 && src != result)
1924   //   result += 1.0
1925 
1926   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1927 
1928   const auto Zero = B.buildFConstant(S64, 0.0);
1929   const auto One = B.buildFConstant(S64, 1.0);
1930   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1931   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1932   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1933   auto Add = B.buildSelect(S64, And, One, Zero);
1934 
1935   // TODO: Should this propagate fast-math-flags?
1936   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1938 }
1939 
1940 bool AMDGPULegalizerInfo::legalizeFrem(
1941   MachineInstr &MI, MachineRegisterInfo &MRI,
1942   MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);
1948 
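  // frem(x, y) = x - trunc(x / y) * y, computed here as
  // fma(-trunc(x / y), y, x).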
  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1955 }
1956 
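// Extract the 11-bit exponent field from the high 32 bits of an f64 value
// and return it with the exponent bias (1023) subtracted.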
1957 static MachineInstrBuilder extractF64Exponent(Register Hi,
1958                                               MachineIRBuilder &B) {
1959   const unsigned FractBits = 52;
1960   const unsigned ExpBits = 11;
1961   LLT S32 = LLT::scalar(32);
1962 
1963   auto Const0 = B.buildConstant(S32, FractBits - 32);
1964   auto Const1 = B.buildConstant(S32, ExpBits);
1965 
1966   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1967     .addUse(Hi)
1968     .addUse(Const0.getReg(0))
1969     .addUse(Const1.getReg(0));
1970 
1971   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1972 }
1973 
1974 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1975   MachineInstr &MI, MachineRegisterInfo &MRI,
1976   MachineIRBuilder &B) const {
1977   const LLT S1 = LLT::scalar(1);
1978   const LLT S32 = LLT::scalar(32);
1979   const LLT S64 = LLT::scalar(64);
1980 
1981   Register Src = MI.getOperand(1).getReg();
1982   assert(MRI.getType(Src) == S64);
1983 
1984   // TODO: Should this use extract since the low half is unused?
1985   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1986   Register Hi = Unmerge.getReg(1);
1987 
1988   // Extract the upper half, since this is where we will find the sign and
1989   // exponent.
1990   auto Exp = extractF64Exponent(Hi, B);
1991 
1992   const unsigned FractBits = 52;
1993 
1994   // Extract the sign bit.
1995   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1996   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1997 
1998   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1999 
2000   const auto Zero32 = B.buildConstant(S32, 0);
2001 
2002   // Extend back to 64-bits.
2003   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2004 
2005   auto Shr = B.buildAShr(S64, FractMask, Exp);
2006   auto Not = B.buildNot(S64, Shr);
2007   auto Tmp0 = B.buildAnd(S64, Src, Not);
2008   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2009 
2010   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2011   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2012 
2013   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2014   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2015   MI.eraseFromParent();
2016   return true;
2017 }
2018 
2019 bool AMDGPULegalizerInfo::legalizeITOFP(
2020   MachineInstr &MI, MachineRegisterInfo &MRI,
2021   MachineIRBuilder &B, bool Signed) const {
2022 
2023   Register Dst = MI.getOperand(0).getReg();
2024   Register Src = MI.getOperand(1).getReg();
2025 
2026   const LLT S64 = LLT::scalar(64);
2027   const LLT S32 = LLT::scalar(32);
2028 
2029   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2030 
2031   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2032 
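  // Convert the two 32-bit halves separately, scale the converted high half
  // by 2^32 with ldexp, and add in the converted low half.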
2033   auto CvtHi = Signed ?
2034     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2035     B.buildUITOFP(S64, Unmerge.getReg(1));
2036 
2037   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2038 
2039   auto ThirtyTwo = B.buildConstant(S32, 32);
2040   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2041     .addUse(CvtHi.getReg(0))
2042     .addUse(ThirtyTwo.getReg(0));
2043 
2044   // TODO: Should this propagate fast-math-flags?
2045   B.buildFAdd(Dst, LdExp, CvtLo);
2046   MI.eraseFromParent();
2047   return true;
2048 }
2049 
2050 // TODO: Copied from DAG implementation. Verify logic and document how this
2051 // actually works.
2052 bool AMDGPULegalizerInfo::legalizeFPTOI(
2053   MachineInstr &MI, MachineRegisterInfo &MRI,
2054   MachineIRBuilder &B, bool Signed) const {
2055 
2056   Register Dst = MI.getOperand(0).getReg();
2057   Register Src = MI.getOperand(1).getReg();
2058 
2059   const LLT S64 = LLT::scalar(64);
2060   const LLT S32 = LLT::scalar(32);
2061 
2062   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2063 
2064   unsigned Flags = MI.getFlags();
2065 
2066   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
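  // K0 = 2^-32 and K1 = -2^32. Split trunc(x) into a high part,
  // floor(trunc(x) * 2^-32), and a low remainder computed with an fma, then
  // convert both pieces to 32-bit integers and merge them.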
2067   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2068   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2069 
2070   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2071   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2072   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2073 
2074   auto Hi = Signed ?
2075     B.buildFPTOSI(S32, FloorMul) :
2076     B.buildFPTOUI(S32, FloorMul);
2077   auto Lo = B.buildFPTOUI(S32, Fma);
2078 
2079   B.buildMerge(Dst, { Lo, Hi });
2080   MI.eraseFromParent();
2081 
2082   return true;
2083 }
2084 
2085 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2086                                                MachineInstr &MI) const {
2087   MachineFunction &MF = Helper.MIRBuilder.getMF();
2088   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2089 
2090   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2091                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2092 
2093   // With ieee_mode disabled, the instructions have the correct behavior
2094   // already for G_FMINNUM/G_FMAXNUM
2095   if (!MFI->getMode().IEEE)
2096     return !IsIEEEOp;
2097 
2098   if (IsIEEEOp)
2099     return true;
2100 
2101   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2105   MachineInstr &MI, MachineRegisterInfo &MRI,
2106   MachineIRBuilder &B) const {
2107   // TODO: Should move some of this into LegalizerHelper.
2108 
2109   // TODO: Promote dynamic indexing of s16 to s32
2110 
2111   // FIXME: Artifact combiner probably should have replaced the truncated
2112   // constant before this, so we shouldn't need
2113   // getConstantVRegValWithLookThrough.
2114   Optional<ValueAndVReg> MaybeIdxVal =
2115       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2116   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2117     return true;
2118   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2119 
2120   Register Dst = MI.getOperand(0).getReg();
2121   Register Vec = MI.getOperand(1).getReg();
2122 
2123   LLT VecTy = MRI.getType(Vec);
2124   LLT EltTy = VecTy.getElementType();
2125   assert(EltTy == MRI.getType(Dst));
2126 
2127   if (IdxVal < VecTy.getNumElements())
2128     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2129   else
2130     B.buildUndef(Dst);
2131 
2132   MI.eraseFromParent();
2133   return true;
2134 }
2135 
2136 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2137   MachineInstr &MI, MachineRegisterInfo &MRI,
2138   MachineIRBuilder &B) const {
2139   // TODO: Should move some of this into LegalizerHelper.
2140 
2141   // TODO: Promote dynamic indexing of s16 to s32
2142 
2143   // FIXME: Artifact combiner probably should have replaced the truncated
2144   // constant before this, so we shouldn't need
2145   // getConstantVRegValWithLookThrough.
2146   Optional<ValueAndVReg> MaybeIdxVal =
2147       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2148   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2149     return true;
2150 
2151   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2152   Register Dst = MI.getOperand(0).getReg();
2153   Register Vec = MI.getOperand(1).getReg();
2154   Register Ins = MI.getOperand(2).getReg();
2155 
2156   LLT VecTy = MRI.getType(Vec);
2157   LLT EltTy = VecTy.getElementType();
2158   assert(EltTy == MRI.getType(Ins));
2159 
2160   if (IdxVal < VecTy.getNumElements())
2161     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2162   else
2163     B.buildUndef(Dst);
2164 
2165   MI.eraseFromParent();
2166   return true;
2167 }
2168 
2169 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2170   MachineInstr &MI, MachineRegisterInfo &MRI,
2171   MachineIRBuilder &B) const {
2172   const LLT V2S16 = LLT::vector(2, 16);
2173 
2174   Register Dst = MI.getOperand(0).getReg();
2175   Register Src0 = MI.getOperand(1).getReg();
2176   LLT DstTy = MRI.getType(Dst);
2177   LLT SrcTy = MRI.getType(Src0);
2178 
2179   if (SrcTy == V2S16 && DstTy == V2S16 &&
2180       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2181     return true;
2182 
2183   MachineIRBuilder HelperBuilder(MI);
2184   GISelObserverWrapper DummyObserver;
2185   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2186   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::legalizeSinCos(
2190   MachineInstr &MI, MachineRegisterInfo &MRI,
2191   MachineIRBuilder &B) const {
2192 
2193   Register DstReg = MI.getOperand(0).getReg();
2194   Register SrcReg = MI.getOperand(1).getReg();
2195   LLT Ty = MRI.getType(DstReg);
2196   unsigned Flags = MI.getFlags();
2197 
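  // Scale the input by 1/(2*pi) for the amdgcn sin/cos intrinsics; on
  // subtargets with a reduced trig input range, also take the fractional
  // part of the scaled value first.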
2198   Register TrigVal;
2199   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2200   if (ST.hasTrigReducedRange()) {
2201     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2202     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2203       .addUse(MulVal.getReg(0))
2204       .setMIFlags(Flags).getReg(0);
2205   } else
2206     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2207 
2208   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2209     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2210   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2211     .addUse(TrigVal)
2212     .setMIFlags(Flags);
2213   MI.eraseFromParent();
2214   return true;
2215 }
2216 
2217 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2218                                                   MachineIRBuilder &B,
2219                                                   const GlobalValue *GV,
2220                                                   int64_t Offset,
2221                                                   unsigned GAFlags) const {
2222   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2223   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2224   // to the following code sequence:
2225   //
2226   // For constant address space:
2227   //   s_getpc_b64 s[0:1]
2228   //   s_add_u32 s0, s0, $symbol
2229   //   s_addc_u32 s1, s1, 0
2230   //
2231   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2232   //   a fixup or relocation is emitted to replace $symbol with a literal
2233   //   constant, which is a pc-relative offset from the encoding of the $symbol
2234   //   operand to the global variable.
2235   //
2236   // For global address space:
2237   //   s_getpc_b64 s[0:1]
2238   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2239   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2240   //
2241   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2242   //   fixups or relocations are emitted to replace $symbol@*@lo and
2243   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2244   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2245   //   operand to the global variable.
2246   //
2247   // What we want here is an offset from the value returned by s_getpc
2248   // (which is the address of the s_add_u32 instruction) to the global
2249   // variable, but since the encoding of $symbol starts 4 bytes after the start
2250   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2251   // small. This requires us to add 4 to the global variable offset in order to
2252   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2253   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2254   // instruction.
2255 
2256   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2257 
2258   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2259     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2260 
2261   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2262     .addDef(PCReg);
2263 
2264   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2265   if (GAFlags == SIInstrInfo::MO_NONE)
2266     MIB.addImm(0);
2267   else
2268     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2269 
2270   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2271 
2272   if (PtrTy.getSizeInBits() == 32)
2273     B.buildExtract(DstReg, PCReg, 0);
2274   return true;
2275  }
2276 
2277 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2278   MachineInstr &MI, MachineRegisterInfo &MRI,
2279   MachineIRBuilder &B) const {
2280   Register DstReg = MI.getOperand(0).getReg();
2281   LLT Ty = MRI.getType(DstReg);
2282   unsigned AS = Ty.getAddressSpace();
2283 
2284   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2285   MachineFunction &MF = B.getMF();
2286   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2287 
2288   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2289     if (!MFI->isModuleEntryFunction()) {
2290       const Function &Fn = MF.getFunction();
2291       DiagnosticInfoUnsupported BadLDSDecl(
2292         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2293         DS_Warning);
2294       Fn.getContext().diagnose(BadLDSDecl);
2295 
2296       // We currently don't have a way to correctly allocate LDS objects that
2297       // aren't directly associated with a kernel. We do force inlining of
2298       // functions that use local objects. However, if these dead functions are
2299       // not eliminated, we don't want a compile time error. Just emit a warning
2300       // and a trap, since there should be no callable path here.
2301       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2302       B.buildUndef(DstReg);
2303       MI.eraseFromParent();
2304       return true;
2305     }
2306 
2307     // TODO: We could emit code to handle the initialization somewhere.
2308     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2309       const SITargetLowering *TLI = ST.getTargetLowering();
2310       if (!TLI->shouldUseLDSConstAddress(GV)) {
2311         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2312         return true; // Leave in place;
2313       }
2314 
2315       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2316         Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or a similar
        // zero-sized type in other languages to declare dynamic shared
        // memory whose size is not known at compile time. These objects are
        // allocated by the runtime and placed directly after the statically
        // allocated ones. They all share the same offset.
2322         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2323           // Adjust alignment for that dynamic shared memory array.
2324           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2325           LLT S32 = LLT::scalar(32);
2326           auto Sz =
2327               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2328           B.buildIntToPtr(DstReg, Sz);
2329           MI.eraseFromParent();
2330           return true;
2331         }
2332       }
2333 
2334       B.buildConstant(
2335           DstReg,
2336           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2337       MI.eraseFromParent();
2338       return true;
2339     }
2340 
2341     const Function &Fn = MF.getFunction();
2342     DiagnosticInfoUnsupported BadInit(
2343       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2344     Fn.getContext().diagnose(BadInit);
2345     return true;
2346   }
2347 
2348   const SITargetLowering *TLI = ST.getTargetLowering();
2349 
2350   if (TLI->shouldEmitFixup(GV)) {
2351     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2352     MI.eraseFromParent();
2353     return true;
2354   }
2355 
2356   if (TLI->shouldEmitPCReloc(GV)) {
2357     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2358     MI.eraseFromParent();
2359     return true;
2360   }
2361 
2362   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2363   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2364 
2365   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2366       MachinePointerInfo::getGOT(MF),
2367       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2368           MachineMemOperand::MOInvariant,
2369       8 /*Size*/, Align(8));
2370 
2371   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2372 
2373   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2375     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2376     B.buildExtract(DstReg, Load, 0);
2377   } else
2378     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2379 
2380   MI.eraseFromParent();
2381   return true;
2382 }
2383 
2384 static LLT widenToNextPowerOf2(LLT Ty) {
2385   if (Ty.isVector())
2386     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2387   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2388 }
2389 
2390 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2391                                        MachineInstr &MI) const {
2392   MachineIRBuilder &B = Helper.MIRBuilder;
2393   MachineRegisterInfo &MRI = *B.getMRI();
2394   GISelChangeObserver &Observer = Helper.Observer;
2395 
2396   Register PtrReg = MI.getOperand(1).getReg();
2397   LLT PtrTy = MRI.getType(PtrReg);
2398   unsigned AddrSpace = PtrTy.getAddressSpace();
2399 
2400   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2401     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2402     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2403     Observer.changingInstr(MI);
2404     MI.getOperand(1).setReg(Cast.getReg(0));
2405     Observer.changedInstr(MI);
2406     return true;
2407   }
2408 
2409   Register ValReg = MI.getOperand(0).getReg();
2410   LLT ValTy = MRI.getType(ValReg);
2411 
2412   MachineMemOperand *MMO = *MI.memoperands_begin();
2413   const unsigned ValSize = ValTy.getSizeInBits();
2414   const unsigned MemSize = 8 * MMO->getSize();
2415   const Align MemAlign = MMO->getAlign();
2416   const unsigned AlignInBits = 8 * MemAlign.value();
2417 
2418   // Widen non-power-of-2 loads to the alignment if needed
2419   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2420     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2421 
2422     // This was already the correct extending load result type, so just adjust
2423     // the memory type.
2424     if (WideMemSize == ValSize) {
2425       MachineFunction &MF = B.getMF();
2426 
2427       MachineMemOperand *WideMMO =
2428           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2429       Observer.changingInstr(MI);
2430       MI.setMemRefs(MF, {WideMMO});
2431       Observer.changedInstr(MI);
2432       return true;
2433     }
2434 
    // Don't bother handling this edge case, which should probably never be
    // produced.
2436     if (ValSize > WideMemSize)
2437       return false;
2438 
2439     LLT WideTy = widenToNextPowerOf2(ValTy);
2440 
2441     Register WideLoad;
2442     if (!WideTy.isVector()) {
2443       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2444       B.buildTrunc(ValReg, WideLoad).getReg(0);
2445     } else {
2446       // Extract the subvector.
2447 
2448       if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
2450         // (e.g. <3 x s32> -> <4 x s32>)
2451         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2452         B.buildExtract(ValReg, WideLoad, 0);
2453       } else {
2454         // For cases where the widened type isn't a nice register value, unmerge
2455         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2456         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2457         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2458         B.setInsertPt(B.getMBB(), MI.getIterator());
2459         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2460       }
2461     }
2462 
2463     MI.eraseFromParent();
2464     return true;
2465   }
2466 
2467   return false;
2468 }
2469 
2470 bool AMDGPULegalizerInfo::legalizeFMad(
2471   MachineInstr &MI, MachineRegisterInfo &MRI,
2472   MachineIRBuilder &B) const {
2473   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2474   assert(Ty.isScalar());
2475 
2476   MachineFunction &MF = B.getMF();
2477   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2478 
2479   // TODO: Always legal with future ftz flag.
2480   // FIXME: Do we need just output?
2481   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2482     return true;
2483   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2484     return true;
2485 
2486   MachineIRBuilder HelperBuilder(MI);
2487   GISelObserverWrapper DummyObserver;
2488   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2489   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2490 }
2491 
2492 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2493   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2494   Register DstReg = MI.getOperand(0).getReg();
2495   Register PtrReg = MI.getOperand(1).getReg();
2496   Register CmpVal = MI.getOperand(2).getReg();
2497   Register NewVal = MI.getOperand(3).getReg();
2498 
2499   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2500          "this should not have been custom lowered");
2501 
2502   LLT ValTy = MRI.getType(CmpVal);
2503   LLT VecTy = LLT::vector(2, ValTy);
2504 
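  // Pack the new value and the compare value into a single vector operand
  // for the target G_AMDGPU_ATOMIC_CMPXCHG pseudo.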
2505   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2506 
2507   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2508     .addDef(DstReg)
2509     .addUse(PtrReg)
2510     .addUse(PackedVal)
2511     .setMemRefs(MI.memoperands());
2512 
2513   MI.eraseFromParent();
2514   return true;
2515 }
2516 
2517 bool AMDGPULegalizerInfo::legalizeFlog(
2518   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2519   Register Dst = MI.getOperand(0).getReg();
2520   Register Src = MI.getOperand(1).getReg();
2521   LLT Ty = B.getMRI()->getType(Dst);
2522   unsigned Flags = MI.getFlags();
2523 
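  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes 1 / log2(b) as
  // Log2BaseInverted.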
2524   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2525   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2526 
2527   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2528   MI.eraseFromParent();
2529   return true;
2530 }
2531 
2532 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2533                                        MachineIRBuilder &B) const {
2534   Register Dst = MI.getOperand(0).getReg();
2535   Register Src = MI.getOperand(1).getReg();
2536   unsigned Flags = MI.getFlags();
2537   LLT Ty = B.getMRI()->getType(Dst);
2538 
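  // exp(x) = exp2(x * log2(e)).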
2539   auto K = B.buildFConstant(Ty, numbers::log2e);
2540   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2541   B.buildFExp2(Dst, Mul, Flags);
2542   MI.eraseFromParent();
2543   return true;
2544 }
2545 
2546 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2547                                        MachineIRBuilder &B) const {
2548   Register Dst = MI.getOperand(0).getReg();
2549   Register Src0 = MI.getOperand(1).getReg();
2550   Register Src1 = MI.getOperand(2).getReg();
2551   unsigned Flags = MI.getFlags();
2552   LLT Ty = B.getMRI()->getType(Dst);
2553   const LLT S16 = LLT::scalar(16);
2554   const LLT S32 = LLT::scalar(32);
2555 
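  // pow(x, y) is lowered as exp2(y * log2(x)), using the amdgcn_fmul_legacy
  // intrinsic for the multiply.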
2556   if (Ty == S32) {
2557     auto Log = B.buildFLog2(S32, Src0, Flags);
2558     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2559       .addUse(Log.getReg(0))
2560       .addUse(Src1)
2561       .setMIFlags(Flags);
2562     B.buildFExp2(Dst, Mul, Flags);
2563   } else if (Ty == S16) {
2564     // There's no f16 fmul_legacy, so we need to convert for it.
2565     auto Log = B.buildFLog2(S16, Src0, Flags);
2566     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2567     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2568     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2569       .addUse(Ext0.getReg(0))
2570       .addUse(Ext1.getReg(0))
2571       .setMIFlags(Flags);
2572 
2573     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2574   } else
2575     return false;
2576 
2577   MI.eraseFromParent();
2578   return true;
2579 }
2580 
2581 // Find a source register, ignoring any possible source modifiers.
2582 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2583   Register ModSrc = OrigSrc;
2584   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2585     ModSrc = SrcFNeg->getOperand(1).getReg();
2586     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2587       ModSrc = SrcFAbs->getOperand(1).getReg();
2588   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2589     ModSrc = SrcFAbs->getOperand(1).getReg();
2590   return ModSrc;
2591 }
2592 
2593 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2594                                          MachineRegisterInfo &MRI,
2595                                          MachineIRBuilder &B) const {
2596 
2597   const LLT S1 = LLT::scalar(1);
2598   const LLT S64 = LLT::scalar(64);
2599   Register Dst = MI.getOperand(0).getReg();
2600   Register OrigSrc = MI.getOperand(1).getReg();
2601   unsigned Flags = MI.getFlags();
2602   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2603          "this should not have been custom lowered");
2604 
2605   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2606   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2607   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2608   // V_FRACT bug is:
2609   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2610   //
2611   // Convert floor(x) to (x - fract(x))
2612 
2613   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2614     .addUse(OrigSrc)
2615     .setMIFlags(Flags);
2616 
2617   // Give source modifier matching some assistance before obscuring a foldable
2618   // pattern.
2619 
2620   // TODO: We can avoid the neg on the fract? The input sign to fract
2621   // shouldn't matter?
2622   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2623 
2624   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2625 
2626   Register Min = MRI.createGenericVirtualRegister(S64);
2627 
2628   // We don't need to concern ourselves with the snan handling difference, so
2629   // use the one which will directly select.
2630   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2631   if (MFI->getMode().IEEE)
2632     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2633   else
2634     B.buildFMinNum(Min, Fract, Const, Flags);
2635 
2636   Register CorrectedFract = Min;
2637   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2638     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2639     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2640   }
2641 
2642   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2643   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2644 
2645   MI.eraseFromParent();
2646   return true;
2647 }
2648 
2649 // Turn an illegal packed v2s16 build vector into bit operations.
2650 // TODO: This should probably be a bitcast action in LegalizerHelper.
2651 bool AMDGPULegalizerInfo::legalizeBuildVector(
2652   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2653   Register Dst = MI.getOperand(0).getReg();
2654   const LLT S32 = LLT::scalar(32);
2655   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2656 
2657   Register Src0 = MI.getOperand(1).getReg();
2658   Register Src1 = MI.getOperand(2).getReg();
2659   assert(MRI.getType(Src0) == LLT::scalar(16));
2660 
2661   auto Merge = B.buildMerge(S32, {Src0, Src1});
2662   B.buildBitcast(Dst, Merge);
2663 
2664   MI.eraseFromParent();
2665   return true;
2666 }
2667 
2668 // Check that this is a G_XOR x, -1
2669 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2670   if (MI.getOpcode() != TargetOpcode::G_XOR)
2671     return false;
2672   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2673   return ConstVal && *ConstVal == -1;
2674 }
2675 
2676 // Return the use branch instruction, otherwise null if the usage is invalid.
2677 static MachineInstr *
2678 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2679                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2680   Register CondDef = MI.getOperand(0).getReg();
2681   if (!MRI.hasOneNonDBGUse(CondDef))
2682     return nullptr;
2683 
2684   MachineBasicBlock *Parent = MI.getParent();
2685   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2686 
2687   if (isNot(MRI, *UseMI)) {
2688     Register NegatedCond = UseMI->getOperand(0).getReg();
2689     if (!MRI.hasOneNonDBGUse(NegatedCond))
2690       return nullptr;
2691 
2692     // We're deleting the def of this value, so we need to remove it.
2693     UseMI->eraseFromParent();
2694 
2695     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2696     Negated = true;
2697   }
2698 
2699   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2700     return nullptr;
2701 
2702   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2703   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2704   if (Next == Parent->end()) {
2705     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2706     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2707       return nullptr;
2708     UncondBrTarget = &*NextMBB;
2709   } else {
2710     if (Next->getOpcode() != AMDGPU::G_BR)
2711       return nullptr;
2712     Br = &*Next;
2713     UncondBrTarget = Br->getOperand(0).getMBB();
2714   }
2715 
2716   return UseMI;
2717 }
2718 
2719 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2720                                          const ArgDescriptor *Arg,
2721                                          const TargetRegisterClass *ArgRC,
2722                                          LLT ArgTy) const {
2723   MCRegister SrcReg = Arg->getRegister();
2724   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2725   assert(DstReg.isVirtual() && "Virtual register expected");
2726 
2727   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2728                                              ArgTy);
2729   if (Arg->isMasked()) {
2730     // TODO: Should we try to emit this once in the entry block?
2731     const LLT S32 = LLT::scalar(32);
2732     const unsigned Mask = Arg->getMask();
2733     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2734 
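    // The value is packed into a bitfield of the live-in register; shift it
    // down to bit 0 and mask off the field width.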
2735     Register AndMaskSrc = LiveIn;
2736 
2737     if (Shift != 0) {
2738       auto ShiftAmt = B.buildConstant(S32, Shift);
2739       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2740     }
2741 
2742     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2743   } else {
2744     B.buildCopy(DstReg, LiveIn);
2745   }
2746 
2747   return true;
2748 }
2749 
2750 bool AMDGPULegalizerInfo::loadInputValue(
2751     Register DstReg, MachineIRBuilder &B,
2752     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2753   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2754   const ArgDescriptor *Arg;
2755   const TargetRegisterClass *ArgRC;
2756   LLT ArgTy;
2757   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2758 
2759   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2760     return false; // TODO: Handle these
2761   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2762 }
2763 
2764 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2765     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2766     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2767   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2768     return false;
2769 
2770   MI.eraseFromParent();
2771   return true;
2772 }
2773 
2774 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2775                                        MachineRegisterInfo &MRI,
2776                                        MachineIRBuilder &B) const {
2777   Register Dst = MI.getOperand(0).getReg();
2778   LLT DstTy = MRI.getType(Dst);
2779   LLT S16 = LLT::scalar(16);
2780   LLT S32 = LLT::scalar(32);
2781   LLT S64 = LLT::scalar(64);
2782 
2783   if (DstTy == S16)
2784     return legalizeFDIV16(MI, MRI, B);
2785   if (DstTy == S32)
2786     return legalizeFDIV32(MI, MRI, B);
2787   if (DstTy == S64)
2788     return legalizeFDIV64(MI, MRI, B);
2789 
2790   return false;
2791 }
2792 
2793 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2794                                                   Register DstReg,
2795                                                   Register X,
2796                                                   Register Y,
2797                                                   bool IsDiv) const {
2798   const LLT S1 = LLT::scalar(1);
2799   const LLT S32 = LLT::scalar(32);
2800 
2801   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2802   // algorithm used here.
2803 
2804   // Initial estimate of inv(y).
2805   auto FloatY = B.buildUITOFP(S32, Y);
2806   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2807   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2808   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2809   auto Z = B.buildFPTOUI(S32, ScaledY);
2810 
2811   // One round of UNR.
2812   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2813   auto NegYZ = B.buildMul(S32, NegY, Z);
2814   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2815 
2816   // Quotient/remainder estimate.
2817   auto Q = B.buildUMulH(S32, X, Z);
2818   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2819 
2820   // First quotient/remainder refinement.
2821   auto One = B.buildConstant(S32, 1);
2822   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2823   if (IsDiv)
2824     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2825   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2826 
2827   // Second quotient/remainder refinement.
2828   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2829   if (IsDiv)
2830     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2831   else
2832     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2833 }
2834 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2836 //
2837 // Return lo, hi of result
2838 //
2839 // %cvt.lo = G_UITOFP Val.lo
2840 // %cvt.hi = G_UITOFP Val.hi
2841 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2842 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2843 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2844 // %mul2 = G_FMUL %mul1, 2**(-32)
2845 // %trunc = G_INTRINSIC_TRUNC %mul2
2846 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2847 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2848 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2849                                                        Register Val) {
2850   const LLT S32 = LLT::scalar(32);
2851   auto Unmerge = B.buildUnmerge(S32, Val);
2852 
2853   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2854   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2855 
2856   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2857                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2858 
2859   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2860   auto Mul1 =
2861       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2862 
2863   // 2**(-32)
2864   auto Mul2 =
2865       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2866   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2867 
2868   // -(2**32)
2869   auto Mad2 = B.buildFMAD(S32, Trunc,
2870                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2871 
2872   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2873   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2874 
2875   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2876 }
2877 
2878 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2879                                                   Register DstReg,
2880                                                   Register Numer,
2881                                                   Register Denom,
2882                                                   bool IsDiv) const {
2883   const LLT S32 = LLT::scalar(32);
2884   const LLT S64 = LLT::scalar(64);
2885   const LLT S1 = LLT::scalar(1);
2886   Register RcpLo, RcpHi;
2887 
2888   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2889 
2890   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2891 
2892   auto Zero64 = B.buildConstant(S64, 0);
2893   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2894 
2895   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2896   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2897 
2898   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2899   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2900   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2901 
2902   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2903   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2904   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2905   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2906 
2907   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2908   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2909   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2910   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2911   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2912 
2913   auto Zero32 = B.buildConstant(S32, 0);
2914   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2915   auto Add2_HiC =
2916       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2917   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2918   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2919 
2920   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2921   Register NumerLo = UnmergeNumer.getReg(0);
2922   Register NumerHi = UnmergeNumer.getReg(1);
2923 
2924   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2925   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2926   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2927   Register Mul3_Lo = UnmergeMul3.getReg(0);
2928   Register Mul3_Hi = UnmergeMul3.getReg(1);
2929   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2930   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2931   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2932   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2933 
2934   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2935   Register DenomLo = UnmergeDenom.getReg(0);
2936   Register DenomHi = UnmergeDenom.getReg(1);
2937 
2938   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2939   auto C1 = B.buildSExt(S32, CmpHi);
2940 
2941   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2942   auto C2 = B.buildSExt(S32, CmpLo);
2943 
2944   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2945   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2946 
2947   // TODO: Here and below, portions of the code could be enclosed in if/endif
2948   // blocks. Currently the control flow is unconditional, and we use 4 selects
2949   // after the potential endif in place of PHIs.
2950 
2951   // if C3 != 0 ...
2952   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2953   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2954   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2955   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2956 
2957   auto One64 = B.buildConstant(S64, 1);
2958   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2959 
2960   auto C4 =
2961       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2962   auto C5 =
2963       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2964   auto C6 = B.buildSelect(
2965       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2966 
2967   // if (C6 != 0)
2968   auto Add4 = B.buildAdd(S64, Add3, One64);
2969   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2970 
2971   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2972   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2973   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2974 
2975   // endif C6
2976   // endif C3
2977 
2978   if (IsDiv) {
2979     auto Sel1 = B.buildSelect(
2980         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2981     B.buildSelect(DstReg,
2982                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2983   } else {
2984     auto Sel2 = B.buildSelect(
2985         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2986     B.buildSelect(DstReg,
2987                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2988   }
2989 }
2990 
2991 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2992                                             MachineRegisterInfo &MRI,
2993                                             MachineIRBuilder &B) const {
2994   const LLT S64 = LLT::scalar(64);
2995   const LLT S32 = LLT::scalar(32);
2996   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2997   Register DstReg = MI.getOperand(0).getReg();
2998   Register Num = MI.getOperand(1).getReg();
2999   Register Den = MI.getOperand(2).getReg();
3000   LLT Ty = MRI.getType(DstReg);
3001 
3002   if (Ty == S32)
3003     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3004   else if (Ty == S64)
3005     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3006   else
3007     return false;
3008 
3009   MI.eraseFromParent();
3010   return true;
3012 }
3013 
3014 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3015                                             MachineRegisterInfo &MRI,
3016                                             MachineIRBuilder &B) const {
3017   const LLT S64 = LLT::scalar(64);
3018   const LLT S32 = LLT::scalar(32);
3019 
3020   Register DstReg = MI.getOperand(0).getReg();
3021   const LLT Ty = MRI.getType(DstReg);
3022   if (Ty != S32 && Ty != S64)
3023     return false;
3024 
3025   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3026 
3027   Register LHS = MI.getOperand(1).getReg();
3028   Register RHS = MI.getOperand(2).getReg();
3029 
3030   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3031   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3032   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3033 
3034   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3035   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3036 
3037   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3038   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3039 
3040   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3041   if (Ty == S32)
3042     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3043   else
3044     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3045 
3046   Register Sign;
3047   if (IsDiv)
3048     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3049   else
3050     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3051 
3052   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3053   B.buildSub(DstReg, UDivRem, Sign);
3054 
3055   MI.eraseFromParent();
3056   return true;
3057 }
3058 
3059 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3060                                                  MachineRegisterInfo &MRI,
3061                                                  MachineIRBuilder &B) const {
3062   Register Res = MI.getOperand(0).getReg();
3063   Register LHS = MI.getOperand(1).getReg();
3064   Register RHS = MI.getOperand(2).getReg();
3065   uint16_t Flags = MI.getFlags();
3066   LLT ResTy = MRI.getType(Res);
3067 
3068   const MachineFunction &MF = B.getMF();
3069   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3070                             MI.getFlag(MachineInstr::FmAfn);
3071 
3072   if (!AllowInaccurateRcp)
3073     return false;
3074 
3075   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3076     // 1 / x -> RCP(x)
3077     if (CLHS->isExactlyValue(1.0)) {
3078       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3079         .addUse(RHS)
3080         .setMIFlags(Flags);
3081 
3082       MI.eraseFromParent();
3083       return true;
3084     }
3085 
3086     // -1 / x -> RCP( FNEG(x) )
3087     if (CLHS->isExactlyValue(-1.0)) {
3088       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3089       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3090         .addUse(FNeg.getReg(0))
3091         .setMIFlags(Flags);
3092 
3093       MI.eraseFromParent();
3094       return true;
3095     }
3096   }
3097 
3098   // x / y -> x * (1.0 / y)
3099   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3100     .addUse(RHS)
3101     .setMIFlags(Flags);
3102   B.buildFMul(Res, LHS, RCP, Flags);
3103 
3104   MI.eraseFromParent();
3105   return true;
3106 }
3107 
3108 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3109                                                    MachineRegisterInfo &MRI,
3110                                                    MachineIRBuilder &B) const {
3111   Register Res = MI.getOperand(0).getReg();
3112   Register X = MI.getOperand(1).getReg();
3113   Register Y = MI.getOperand(2).getReg();
3114   uint16_t Flags = MI.getFlags();
3115   LLT ResTy = MRI.getType(Res);
3116 
3117   const MachineFunction &MF = B.getMF();
3118   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3119                             MI.getFlag(MachineInstr::FmAfn);
3120 
3121   if (!AllowInaccurateRcp)
3122     return false;
3123 
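  // Two Newton-Raphson refinements of r ~= 1/y, each written with FMAs as
  // e = fma(-y, r, 1); r = fma(e, r, r), followed by one residual correction of
  // the quotient q = x*r: result = fma(fma(-y, q, x), r, q).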
3124   auto NegY = B.buildFNeg(ResTy, Y);
3125   auto One = B.buildFConstant(ResTy, 1.0);
3126 
3127   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3128     .addUse(Y)
3129     .setMIFlags(Flags);
3130 
3131   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3132   R = B.buildFMA(ResTy, Tmp0, R, R);
3133 
3134   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3135   R = B.buildFMA(ResTy, Tmp1, R, R);
3136 
3137   auto Ret = B.buildFMul(ResTy, X, R);
3138   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3139 
3140   B.buildFMA(Res, Tmp2, R, Ret);
3141   MI.eraseFromParent();
3142   return true;
3143 }
3144 
3145 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3146                                          MachineRegisterInfo &MRI,
3147                                          MachineIRBuilder &B) const {
3148   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3149     return true;
3150 
3151   Register Res = MI.getOperand(0).getReg();
3152   Register LHS = MI.getOperand(1).getReg();
3153   Register RHS = MI.getOperand(2).getReg();
3154 
3155   uint16_t Flags = MI.getFlags();
3156 
3157   LLT S16 = LLT::scalar(16);
3158   LLT S32 = LLT::scalar(32);
3159 
3160   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3161   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3162 
3163   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3164     .addUse(RHSExt.getReg(0))
3165     .setMIFlags(Flags);
3166 
3167   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3168   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3169 
3170   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3171     .addUse(RDst.getReg(0))
3172     .addUse(RHS)
3173     .addUse(LHS)
3174     .setMIFlags(Flags);
3175 
3176   MI.eraseFromParent();
3177   return true;
3178 }
3179 
3180 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3181 // to enable denorm mode; when false, restore the function's default FP32 mode.
3182 static void toggleSPDenormMode(bool Enable,
3183                                MachineIRBuilder &B,
3184                                const GCNSubtarget &ST,
3185                                AMDGPU::SIModeRegisterDefaults Mode) {
3186   // Set SP denorm mode to this value.
3187   unsigned SPDenormMode =
3188     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3189 
3190   if (ST.hasDenormModeInst()) {
3191     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3192     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3193 
3194     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3195     B.buildInstr(AMDGPU::S_DENORM_MODE)
3196       .addImm(NewDenormModeValue);
3197 
3198   } else {
3199     // Select FP32 bit field in mode register.
3200     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3201                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3202                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
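    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denormal controls are the
    // 2-bit field at offset 4 of the MODE register (WIDTH_M1 = 1 means width 2).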
3203 
3204     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3205       .addImm(SPDenormMode)
3206       .addImm(SPDenormModeBitField);
3207   }
3208 }
3209 
3210 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3211                                          MachineRegisterInfo &MRI,
3212                                          MachineIRBuilder &B) const {
3213   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3214     return true;
3215 
3216   Register Res = MI.getOperand(0).getReg();
3217   Register LHS = MI.getOperand(1).getReg();
3218   Register RHS = MI.getOperand(2).getReg();
3219   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3220   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3221 
3222   uint16_t Flags = MI.getFlags();
3223 
3224   LLT S32 = LLT::scalar(32);
3225   LLT S1 = LLT::scalar(1);
3226 
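  // Full-precision expansion: scale the operands with amdgcn.div.scale, refine
  // an rcp estimate of the scaled denominator with a chain of FMAs (FP32
  // denormals are temporarily enabled around the chain when the function does
  // not already allow them), then combine the pieces with amdgcn.div.fmas and
  // let amdgcn.div.fixup handle the special cases.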
3227   auto One = B.buildFConstant(S32, 1.0f);
3228 
3229   auto DenominatorScaled =
3230     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3231       .addUse(LHS)
3232       .addUse(RHS)
3233       .addImm(0)
3234       .setMIFlags(Flags);
3235   auto NumeratorScaled =
3236     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3237       .addUse(LHS)
3238       .addUse(RHS)
3239       .addImm(1)
3240       .setMIFlags(Flags);
3241 
3242   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3243     .addUse(DenominatorScaled.getReg(0))
3244     .setMIFlags(Flags);
3245   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3246 
3247   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3248   // aren't modeled as reading it.
3249   if (!Mode.allFP32Denormals())
3250     toggleSPDenormMode(true, B, ST, Mode);
3251 
3252   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3253   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3254   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3255   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3256   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3257   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3258 
3259   if (!Mode.allFP32Denormals())
3260     toggleSPDenormMode(false, B, ST, Mode);
3261 
3262   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3263     .addUse(Fma4.getReg(0))
3264     .addUse(Fma1.getReg(0))
3265     .addUse(Fma3.getReg(0))
3266     .addUse(NumeratorScaled.getReg(1))
3267     .setMIFlags(Flags);
3268 
3269   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3270     .addUse(Fmas.getReg(0))
3271     .addUse(RHS)
3272     .addUse(LHS)
3273     .setMIFlags(Flags);
3274 
3275   MI.eraseFromParent();
3276   return true;
3277 }
3278 
3279 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3280                                          MachineRegisterInfo &MRI,
3281                                          MachineIRBuilder &B) const {
3282   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3283     return true;
3284 
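  // Full-precision f64 expansion: scale the denominator and numerator with
  // amdgcn.div.scale, run two Newton-Raphson refinements of the rcp estimate
  // via FMAs, multiply by the scaled numerator, and resolve the result with
  // amdgcn.div.fmas and amdgcn.div.fixup. On subtargets where the div_scale
  // condition output is unusable, the scale predicate is recomputed below from
  // the high halves of the operands and the scaled values.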
3285   Register Res = MI.getOperand(0).getReg();
3286   Register LHS = MI.getOperand(1).getReg();
3287   Register RHS = MI.getOperand(2).getReg();
3288 
3289   uint16_t Flags = MI.getFlags();
3290 
3291   LLT S64 = LLT::scalar(64);
3292   LLT S1 = LLT::scalar(1);
3293 
3294   auto One = B.buildFConstant(S64, 1.0);
3295 
3296   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3297     .addUse(LHS)
3298     .addUse(RHS)
3299     .addImm(0)
3300     .setMIFlags(Flags);
3301 
3302   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3303 
3304   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3305     .addUse(DivScale0.getReg(0))
3306     .setMIFlags(Flags);
3307 
3308   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3309   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3310   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3311 
3312   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3313     .addUse(LHS)
3314     .addUse(RHS)
3315     .addImm(1)
3316     .setMIFlags(Flags);
3317 
3318   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3319   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3320   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3321 
3322   Register Scale;
3323   if (!ST.hasUsableDivScaleConditionOutput()) {
3324     // Workaround a hardware bug on SI where the condition output from div_scale
3325     // is not usable.
3326 
3327     LLT S32 = LLT::scalar(32);
3328 
3329     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3330     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3331     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3332     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3333 
3334     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3335                               Scale1Unmerge.getReg(1));
3336     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3337                               Scale0Unmerge.getReg(1));
3338     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3339   } else {
3340     Scale = DivScale1.getReg(1);
3341   }
3342 
3343   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3344     .addUse(Fma4.getReg(0))
3345     .addUse(Fma3.getReg(0))
3346     .addUse(Mul.getReg(0))
3347     .addUse(Scale)
3348     .setMIFlags(Flags);
3349 
3350   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3351     .addUse(Fmas.getReg(0))
3352     .addUse(RHS)
3353     .addUse(LHS)
3354     .setMIFlags(Flags);
3355 
3356   MI.eraseFromParent();
3357   return true;
3358 }
3359 
3360 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3361                                                  MachineRegisterInfo &MRI,
3362                                                  MachineIRBuilder &B) const {
3363   Register Res = MI.getOperand(0).getReg();
3364   Register LHS = MI.getOperand(2).getReg();
3365   Register RHS = MI.getOperand(3).getReg();
3366   uint16_t Flags = MI.getFlags();
3367 
3368   LLT S32 = LLT::scalar(32);
3369   LLT S1 = LLT::scalar(1);
3370 
3371   auto Abs = B.buildFAbs(S32, RHS, Flags);
3372   const APFloat C0Val(1.0f);
3373 
3374   auto C0 = B.buildConstant(S32, 0x6f800000);
3375   auto C1 = B.buildConstant(S32, 0x2f800000);
3376   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3377 
3378   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3379   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3380 
3381   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3382 
3383   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3384     .addUse(Mul0.getReg(0))
3385     .setMIFlags(Flags);
3386 
3387   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3388 
3389   B.buildFMul(Res, Sel, Mul1, Flags);
3390 
3391   MI.eraseFromParent();
3392   return true;
3393 }
3394 
3395 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3396 // FIXME: Why do we handle this one but not other removed instructions?
3397 //
3398 // Reciprocal square root.  The clamp prevents infinite results, clamping
3399 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3400 // +-max_float.
3401 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3402                                                     MachineRegisterInfo &MRI,
3403                                                     MachineIRBuilder &B) const {
3404   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3405     return true;
3406 
3407   Register Dst = MI.getOperand(0).getReg();
3408   Register Src = MI.getOperand(2).getReg();
3409   auto Flags = MI.getFlags();
3410 
3411   LLT Ty = MRI.getType(Dst);
3412 
3413   const fltSemantics *FltSemantics;
3414   if (Ty == LLT::scalar(32))
3415     FltSemantics = &APFloat::IEEEsingle();
3416   else if (Ty == LLT::scalar(64))
3417     FltSemantics = &APFloat::IEEEdouble();
3418   else
3419     return false;
3420 
3421   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3422     .addUse(Src)
3423     .setMIFlags(Flags);
3424 
3425   // We don't need to concern ourselves with the snan handling difference, since
3426   // the rsq result is already quieted (or not); use the form that selects directly.
3427   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3428   const bool UseIEEE = MFI->getMode().IEEE;
3429 
3430   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3431   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3432                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3433 
3434   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3435 
3436   if (UseIEEE)
3437     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3438   else
3439     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3440   MI.eraseFromParent();
3441   return true;
3442 }
3443 
3444 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3445   switch (IID) {
3446   case Intrinsic::amdgcn_ds_fadd:
3447     return AMDGPU::G_ATOMICRMW_FADD;
3448   case Intrinsic::amdgcn_ds_fmin:
3449     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3450   case Intrinsic::amdgcn_ds_fmax:
3451     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3452   default:
3453     llvm_unreachable("not a DS FP intrinsic");
3454   }
3455 }
3456 
3457 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3458                                                       MachineInstr &MI,
3459                                                       Intrinsic::ID IID) const {
3460   GISelChangeObserver &Observer = Helper.Observer;
3461   Observer.changingInstr(MI);
3462 
3463   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3464 
3465   // The remaining operands were used to set fields in the MemOperand on
3466   // construction.
3467   for (int I = 6; I > 3; --I)
3468     MI.RemoveOperand(I);
3469 
3470   MI.RemoveOperand(1); // Remove the intrinsic ID.
3471   Observer.changedInstr(MI);
3472   return true;
3473 }
3474 
3475 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3476                                             MachineRegisterInfo &MRI,
3477                                             MachineIRBuilder &B) const {
3478   uint64_t Offset =
3479     ST.getTargetLowering()->getImplicitParameterOffset(
3480       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3481   LLT DstTy = MRI.getType(DstReg);
3482   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3483 
3484   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3485   if (!loadInputValue(KernargPtrReg, B,
3486                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3487     return false;
3488 
3489   // FIXME: This should be nuw
3490   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3491   return true;
3492 }
3493 
3494 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3495                                                  MachineRegisterInfo &MRI,
3496                                                  MachineIRBuilder &B) const {
3497   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3498   if (!MFI->isEntryFunction()) {
3499     return legalizePreloadedArgIntrin(MI, MRI, B,
3500                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3501   }
3502 
3503   Register DstReg = MI.getOperand(0).getReg();
3504   if (!getImplicitArgPtr(DstReg, MRI, B))
3505     return false;
3506 
3507   MI.eraseFromParent();
3508   return true;
3509 }
3510 
3511 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3512                                               MachineRegisterInfo &MRI,
3513                                               MachineIRBuilder &B,
3514                                               unsigned AddrSpace) const {
3515   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3516   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3517   Register Hi32 = Unmerge.getReg(1);
3518 
3519   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3520   MI.eraseFromParent();
3521   return true;
3522 }
3523 
3524 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3525 // offset (the offset that is included in bounds checking and swizzling, to be
3526 // split between the instruction's voffset and immoffset fields) and soffset
3527 // (the offset that is excluded from bounds checking and swizzling, to go in
3528 // the instruction's soffset field).  This function takes the first kind of
3529 // offset and figures out how to split it between voffset and immoffset.
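//
// For illustration: a constant offset of 5000 is split into an immoffset of 904
// and a voffset addend of 4096 (5000 = 4096 + 904), so the voffset part stays a
// multiple of 4096 and is more likely to be CSEd with neighbouring accesses.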
3530 std::tuple<Register, unsigned, unsigned>
3531 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3532                                         Register OrigOffset) const {
3533   const unsigned MaxImm = 4095;
3534   Register BaseReg;
3535   unsigned TotalConstOffset;
3536   const LLT S32 = LLT::scalar(32);
3537   MachineRegisterInfo &MRI = *B.getMRI();
3538 
3539   std::tie(BaseReg, TotalConstOffset) =
3540       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3541 
3542   unsigned ImmOffset = TotalConstOffset;
3543 
3544   // If BaseReg is a pointer, convert it to int.
3545   if (MRI.getType(BaseReg).isPointer())
3546     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3547 
3548   // If the immediate value is too big for the immoffset field, keep only the low
3549   // 12 bits (value & 4095) in the immoffset field so that the value copied/added
3550   // for the voffset field is a multiple of 4096, and it stands more chance
3551   // of being CSEd with the copy/add for another similar load/store.
3552   // However, do not do that rounding down to a multiple of 4096 if that is a
3553   // negative number, as it appears to be illegal to have a negative offset
3554   // in the vgpr, even if adding the immediate offset makes it positive.
3555   unsigned Overflow = ImmOffset & ~MaxImm;
3556   ImmOffset -= Overflow;
3557   if ((int32_t)Overflow < 0) {
3558     Overflow += ImmOffset;
3559     ImmOffset = 0;
3560   }
3561 
3562   if (Overflow != 0) {
3563     if (!BaseReg) {
3564       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3565     } else {
3566       auto OverflowVal = B.buildConstant(S32, Overflow);
3567       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3568     }
3569   }
3570 
3571   if (!BaseReg)
3572     BaseReg = B.buildConstant(S32, 0).getReg(0);
3573 
3574   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3575 }
3576 
3577 /// Handle register layout difference for f16 images for some subtargets.
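///
/// On subtargets with unpacked D16 VMEM, each s16 element is any-extended into
/// its own 32-bit register. On subtargets with the image store D16 bug, the
/// packed data is instead padded with undef to the register shape the hardware
/// expects (for example, <3 x s16> becomes <6 x s16>, viewed as <3 x s32>).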
3578 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3579                                              MachineRegisterInfo &MRI,
3580                                              Register Reg,
3581                                              bool ImageStore) const {
3582   const LLT S16 = LLT::scalar(16);
3583   const LLT S32 = LLT::scalar(32);
3584   LLT StoreVT = MRI.getType(Reg);
3585   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3586 
3587   if (ST.hasUnpackedD16VMem()) {
3588     auto Unmerge = B.buildUnmerge(S16, Reg);
3589 
3590     SmallVector<Register, 4> WideRegs;
3591     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3592       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3593 
3594     int NumElts = StoreVT.getNumElements();
3595 
3596     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3597   }
3598 
3599   if (ImageStore && ST.hasImageStoreD16Bug()) {
3600     if (StoreVT.getNumElements() == 2) {
3601       SmallVector<Register, 4> PackedRegs;
3602       Reg = B.buildBitcast(S32, Reg).getReg(0);
3603       PackedRegs.push_back(Reg);
3604       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3605       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3606     }
3607 
3608     if (StoreVT.getNumElements() == 3) {
3609       SmallVector<Register, 4> PackedRegs;
3610       auto Unmerge = B.buildUnmerge(S16, Reg);
3611       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3612         PackedRegs.push_back(Unmerge.getReg(I));
3613       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3614       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3615       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3616     }
3617 
3618     if (StoreVT.getNumElements() == 4) {
3619       SmallVector<Register, 4> PackedRegs;
3620       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3621       auto Unmerge = B.buildUnmerge(S32, Reg);
3622       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3623         PackedRegs.push_back(Unmerge.getReg(I));
3624       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3625       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3626     }
3627 
3628     llvm_unreachable("invalid data type");
3629   }
3630 
3631   return Reg;
3632 }
3633 
3634 Register AMDGPULegalizerInfo::fixStoreSourceType(
3635   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3636   MachineRegisterInfo *MRI = B.getMRI();
3637   LLT Ty = MRI->getType(VData);
3638 
3639   const LLT S16 = LLT::scalar(16);
3640 
3641   // Fixup illegal register types for 8-bit and 16-bit stores.
3642   if (Ty == LLT::scalar(8) || Ty == S16) {
3643     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3644     return AnyExt;
3645   }
3646 
3647   if (Ty.isVector()) {
3648     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3649       if (IsFormat)
3650         return handleD16VData(B, *MRI, VData);
3651     }
3652   }
3653 
3654   return VData;
3655 }
3656 
3657 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3658                                               MachineRegisterInfo &MRI,
3659                                               MachineIRBuilder &B,
3660                                               bool IsTyped,
3661                                               bool IsFormat) const {
3662   Register VData = MI.getOperand(1).getReg();
3663   LLT Ty = MRI.getType(VData);
3664   LLT EltTy = Ty.getScalarType();
3665   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3666   const LLT S32 = LLT::scalar(32);
3667 
3668   VData = fixStoreSourceType(B, VData, IsFormat);
3669   Register RSrc = MI.getOperand(2).getReg();
3670 
3671   MachineMemOperand *MMO = *MI.memoperands_begin();
3672   const int MemSize = MMO->getSize();
3673 
3674   unsigned ImmOffset;
3675   unsigned TotalOffset;
3676 
3677   // The typed intrinsics add an immediate after the registers.
3678   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3679 
3680   // The struct intrinsic variants add one additional operand over raw.
3681   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3682   Register VIndex;
3683   int OpOffset = 0;
3684   if (HasVIndex) {
3685     VIndex = MI.getOperand(3).getReg();
3686     OpOffset = 1;
3687   }
3688 
3689   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3690   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3691 
3692   unsigned Format = 0;
3693   if (IsTyped) {
3694     Format = MI.getOperand(5 + OpOffset).getImm();
3695     ++OpOffset;
3696   }
3697 
3698   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3699 
3700   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3701   if (TotalOffset != 0)
3702     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3703 
3704   unsigned Opc;
3705   if (IsTyped) {
3706     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3707                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3708   } else if (IsFormat) {
3709     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3710                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3711   } else {
3712     switch (MemSize) {
3713     case 1:
3714       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3715       break;
3716     case 2:
3717       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3718       break;
3719     default:
3720       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3721       break;
3722     }
3723   }
3724 
3725   if (!VIndex)
3726     VIndex = B.buildConstant(S32, 0).getReg(0);
3727 
3728   auto MIB = B.buildInstr(Opc)
3729     .addUse(VData)              // vdata
3730     .addUse(RSrc)               // rsrc
3731     .addUse(VIndex)             // vindex
3732     .addUse(VOffset)            // voffset
3733     .addUse(SOffset)            // soffset
3734     .addImm(ImmOffset);         // offset(imm)
3735 
3736   if (IsTyped)
3737     MIB.addImm(Format);
3738 
3739   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3740      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3741      .addMemOperand(MMO);
3742 
3743   MI.eraseFromParent();
3744   return true;
3745 }
3746 
3747 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3748                                              MachineRegisterInfo &MRI,
3749                                              MachineIRBuilder &B,
3750                                              bool IsFormat,
3751                                              bool IsTyped) const {
3752   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3753   MachineMemOperand *MMO = *MI.memoperands_begin();
3754   const int MemSize = MMO->getSize();
3755   const LLT S32 = LLT::scalar(32);
3756 
3757   Register Dst = MI.getOperand(0).getReg();
3758   Register RSrc = MI.getOperand(2).getReg();
3759 
3760   // The typed intrinsics add an immediate after the registers.
3761   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3762 
3763   // The struct intrinsic variants add one additional operand over raw.
3764   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3765   Register VIndex;
3766   int OpOffset = 0;
3767   if (HasVIndex) {
3768     VIndex = MI.getOperand(3).getReg();
3769     OpOffset = 1;
3770   }
3771 
3772   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3773   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3774 
3775   unsigned Format = 0;
3776   if (IsTyped) {
3777     Format = MI.getOperand(5 + OpOffset).getImm();
3778     ++OpOffset;
3779   }
3780 
3781   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3782   unsigned ImmOffset;
3783   unsigned TotalOffset;
3784 
3785   LLT Ty = MRI.getType(Dst);
3786   LLT EltTy = Ty.getScalarType();
3787   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3788   const bool Unpacked = ST.hasUnpackedD16VMem();
3789 
3790   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3791   if (TotalOffset != 0)
3792     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3793 
3794   unsigned Opc;
3795 
3796   if (IsTyped) {
3797     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3798                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3799   } else if (IsFormat) {
3800     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3801                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3802   } else {
3803     switch (MemSize) {
3804     case 1:
3805       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3806       break;
3807     case 2:
3808       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3809       break;
3810     default:
3811       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3812       break;
3813     }
3814   }
3815 
3816   Register LoadDstReg;
3817 
3818   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3819   LLT UnpackedTy = Ty.changeElementSize(32);
3820 
3821   if (IsExtLoad)
3822     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3823   else if (Unpacked && IsD16 && Ty.isVector())
3824     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3825   else
3826     LoadDstReg = Dst;
3827 
3828   if (!VIndex)
3829     VIndex = B.buildConstant(S32, 0).getReg(0);
3830 
3831   auto MIB = B.buildInstr(Opc)
3832     .addDef(LoadDstReg)         // vdata
3833     .addUse(RSrc)               // rsrc
3834     .addUse(VIndex)             // vindex
3835     .addUse(VOffset)            // voffset
3836     .addUse(SOffset)            // soffset
3837     .addImm(ImmOffset);         // offset(imm)
3838 
3839   if (IsTyped)
3840     MIB.addImm(Format);
3841 
3842   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3843      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3844      .addMemOperand(MMO);
3845 
3846   if (LoadDstReg != Dst) {
3847     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3848 
3849     // The result of an extending load was widened to 32 bits; truncate back.
3850     if (IsExtLoad)
3851       B.buildTrunc(Dst, LoadDstReg);
3852     else {
3853       // Repack to original 16-bit vector result
3854       // FIXME: G_TRUNC should work, but legalization currently fails
3855       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3856       SmallVector<Register, 4> Repack;
3857       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3858         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3859       B.buildMerge(Dst, Repack);
3860     }
3861   }
3862 
3863   MI.eraseFromParent();
3864   return true;
3865 }
3866 
3867 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3868                                                MachineIRBuilder &B,
3869                                                bool IsInc) const {
3870   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3871                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3872   B.buildInstr(Opc)
3873     .addDef(MI.getOperand(0).getReg())
3874     .addUse(MI.getOperand(2).getReg())
3875     .addUse(MI.getOperand(3).getReg())
3876     .cloneMemRefs(MI);
3877   MI.eraseFromParent();
3878   return true;
3879 }
3880 
3881 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3882   switch (IntrID) {
3883   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3884   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3885     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3886   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3887   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3888     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3889   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3890   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3891     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3892   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3893   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3894     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3895   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3896   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3897     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3898   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3899   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3900     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3901   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3902   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3903     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3904   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3905   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3906     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3907   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3908   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3909     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3910   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3911   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3912     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3913   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3914   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3915     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3916   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3917   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3918     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3919   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3920   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3921     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3922   case Intrinsic::amdgcn_buffer_atomic_fadd:
3923   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3924   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3925     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3926   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3927   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3928     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3929   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3930   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3931     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3932   default:
3933     llvm_unreachable("unhandled atomic opcode");
3934   }
3935 }
3936 
3937 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3938                                                MachineIRBuilder &B,
3939                                                Intrinsic::ID IID) const {
3940   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3941                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3942   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3943 
3944   Register Dst;
3945 
3946   int OpOffset = 0;
3947   if (HasReturn) {
3948     // A few FP atomics do not support return values.
3949     Dst = MI.getOperand(0).getReg();
3950   } else {
3951     OpOffset = -1;
3952   }
3953 
3954   Register VData = MI.getOperand(2 + OpOffset).getReg();
3955   Register CmpVal;
3956 
3957   if (IsCmpSwap) {
3958     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3959     ++OpOffset;
3960   }
3961 
3962   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3963   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3964 
3965   // The struct intrinsic variants add one additional operand over raw.
3966   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3967   Register VIndex;
3968   if (HasVIndex) {
3969     VIndex = MI.getOperand(4 + OpOffset).getReg();
3970     ++OpOffset;
3971   }
3972 
3973   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3974   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3975   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3976 
3977   MachineMemOperand *MMO = *MI.memoperands_begin();
3978 
3979   unsigned ImmOffset;
3980   unsigned TotalOffset;
3981   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3982   if (TotalOffset != 0)
3983     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3984 
3985   if (!VIndex)
3986     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3987 
3988   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3989 
3990   if (HasReturn)
3991     MIB.addDef(Dst);
3992 
3993   MIB.addUse(VData); // vdata
3994 
3995   if (IsCmpSwap)
3996     MIB.addReg(CmpVal);
3997 
3998   MIB.addUse(RSrc)               // rsrc
3999      .addUse(VIndex)             // vindex
4000      .addUse(VOffset)            // voffset
4001      .addUse(SOffset)            // soffset
4002      .addImm(ImmOffset)          // offset(imm)
4003      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4004      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4005      .addMemOperand(MMO);
4006 
4007   MI.eraseFromParent();
4008   return true;
4009 }
4010 
4011 /// Pack the s16 typed vaddr operands of \p MI into dword sized values with two
4012 /// s16 elements each, appending the results to \p PackedAddrs.
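///
/// For example, three s16 coordinates x, y and z are emitted as the two packed
/// values {x, y} and {z, undef}.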
4013 static void packImageA16AddressToDwords(
4014     MachineIRBuilder &B, MachineInstr &MI,
4015     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4016     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4017   const LLT S16 = LLT::scalar(16);
4018   const LLT V2S16 = LLT::vector(2, 16);
4019 
4020   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4021     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4022     if (!SrcOp.isReg())
4023       continue; // _L to _LZ may have eliminated this.
4024 
4025     Register AddrReg = SrcOp.getReg();
4026 
4027     if (I < Intr->GradientStart) {
4028       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4029       PackedAddrs.push_back(AddrReg);
4030     } else {
4031       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4032       // derivatives dx/dh and dx/dv are packed with undef.
4033       if (((I + 1) >= EndIdx) ||
4034           ((Intr->NumGradients / 2) % 2 == 1 &&
4035            (I == static_cast<unsigned>(Intr->GradientStart +
4036                                        (Intr->NumGradients / 2) - 1) ||
4037             I == static_cast<unsigned>(Intr->GradientStart +
4038                                        Intr->NumGradients - 1))) ||
4039           // Check for _L to _LZ optimization
4040           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4041         PackedAddrs.push_back(
4042             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4043                 .getReg(0));
4044       } else {
4045         PackedAddrs.push_back(
4046             B.buildBuildVector(
4047                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4048                 .getReg(0));
4049         ++I;
4050       }
4051     }
4052   }
4053 }
4054 
4055 /// Convert from separate vaddr components to a single vector address register,
4056 /// and replace the remaining operands with $noreg.
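///
/// For example, five s32 address components are padded with undef registers up
/// to eight and combined into a single <8 x s32> build_vector; the remaining
/// vaddr operands are then cleared to $noreg.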
4057 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4058                                      int DimIdx, int NumVAddrs) {
4059   const LLT S32 = LLT::scalar(32);
4060 
4061   SmallVector<Register, 8> AddrRegs;
4062   for (int I = 0; I != NumVAddrs; ++I) {
4063     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4064     if (SrcOp.isReg()) {
4065       AddrRegs.push_back(SrcOp.getReg());
4066       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4067     }
4068   }
4069 
4070   int NumAddrRegs = AddrRegs.size();
4071   if (NumAddrRegs != 1) {
4072     // Round up to 8 elements for v5-v7
4073     // FIXME: Missing intermediate sized register classes and instructions.
4074     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4075       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4076       auto Undef = B.buildUndef(S32);
4077       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4078       NumAddrRegs = RoundedNumRegs;
4079     }
4080 
4081     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4082     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4083   }
4084 
4085   for (int I = 1; I != NumVAddrs; ++I) {
4086     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4087     if (SrcOp.isReg())
4088       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4089   }
4090 }
4091 
4092 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4093 ///
4094 /// Depending on the subtarget, load/store with 16-bit element data need to be
4095 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4096 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4097 /// registers.
4098 ///
4099 /// We don't want to directly select image instructions just yet, but also want
4100 /// to expose all register repacking to the legalizer/combiners. We also don't
4101 /// want a selected instruction entering RegBankSelect. In order to avoid
4102 /// defining a multitude of intermediate image instructions, directly hack on
4103 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4104 /// now unnecessary arguments with $noreg.
4105 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4106     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4107     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4108 
4109   const unsigned NumDefs = MI.getNumExplicitDefs();
4110   const unsigned ArgOffset = NumDefs + 1;
4111   bool IsTFE = NumDefs == 2;
4112   // We are only processing the operands of d16 image operations on subtargets
4113   // that use the unpacked register layout, or need to repack the TFE result.
4114 
4115   // TODO: Do we need to guard against already legalized intrinsics?
4116   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4117       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4118 
4119   MachineRegisterInfo *MRI = B.getMRI();
4120   const LLT S32 = LLT::scalar(32);
4121   const LLT S16 = LLT::scalar(16);
4122   const LLT V2S16 = LLT::vector(2, 16);
4123 
4124   unsigned DMask = 0;
4125 
4126   // Check for 16-bit addresses and gradients; these get packed into dwords below.
4127   LLT GradTy =
4128       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4129   LLT AddrTy =
4130       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4131   const bool IsG16 = GradTy == S16;
4132   const bool IsA16 = AddrTy == S16;
4133 
4134   int DMaskLanes = 0;
4135   if (!BaseOpcode->Atomic) {
4136     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4137     if (BaseOpcode->Gather4) {
4138       DMaskLanes = 4;
4139     } else if (DMask != 0) {
4140       DMaskLanes = countPopulation(DMask);
4141     } else if (!IsTFE && !BaseOpcode->Store) {
4142       // If dmask is 0, this is a no-op load. This can be eliminated.
4143       B.buildUndef(MI.getOperand(0));
4144       MI.eraseFromParent();
4145       return true;
4146     }
4147   }
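  // For illustration: a non-gather load with a dmask of 0b1011 yields
  // DMaskLanes == 3, i.e. three enabled result components.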
4148 
4149   Observer.changingInstr(MI);
4150   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4151 
4152   unsigned NewOpcode = NumDefs == 0 ?
4153     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4154 
4155   // Track that we legalized this
4156   MI.setDesc(B.getTII().get(NewOpcode));
4157 
4158   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
4159   // to be at least 1, otherwise the instruction will fail.
4160   if (IsTFE && DMask == 0) {
4161     DMask = 0x1;
4162     DMaskLanes = 1;
4163     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4164   }
4165 
4166   if (BaseOpcode->Atomic) {
4167     Register VData0 = MI.getOperand(2).getReg();
4168     LLT Ty = MRI->getType(VData0);
4169 
4170     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4171     if (Ty.isVector())
4172       return false;
4173 
4174     if (BaseOpcode->AtomicX2) {
4175       Register VData1 = MI.getOperand(3).getReg();
4176       // The two values are packed in one register.
4177       LLT PackedTy = LLT::vector(2, Ty);
4178       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4179       MI.getOperand(2).setReg(Concat.getReg(0));
4180       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4181     }
4182   }
4183 
4184   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4185 
4186   // Optimize _L to _LZ when _L is zero
4187   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4188           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4189     const ConstantFP *ConstantLod;
4190 
4191     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4192                  m_GFCst(ConstantLod))) {
4193       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4194         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4195         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4196             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4197                                                       Intr->Dim);
4198 
4199         // The starting indexes should remain in the same place.
4200         --CorrectedNumVAddrs;
4201 
4202         MI.getOperand(MI.getNumExplicitDefs())
4203             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4204         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4205         Intr = NewImageDimIntr;
4206       }
4207     }
4208   }
4209 
4210   // Optimize _mip away, when 'lod' is zero
4211   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4212     int64_t ConstantLod;
4213     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4214                  m_ICst(ConstantLod))) {
4215       if (ConstantLod == 0) {
4216         // TODO: Change the intrinsic opcode and remove the operand instead of
4217         // replacing it with 0, as the _L to _LZ handling above does.
4218         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4219         --CorrectedNumVAddrs;
4220       }
4221     }
4222   }
4223 
4224   // Rewrite the addressing register layout before doing anything else.
4225   if (IsA16 || IsG16) {
4226     if (IsA16) {
4227       // Target must support the feature and gradients need to be 16 bit too
4228       if (!ST.hasA16() || !IsG16)
4229         return false;
4230     } else if (!ST.hasG16())
4231       return false;
4232 
4233     if (Intr->NumVAddrs > 1) {
4234       SmallVector<Register, 4> PackedRegs;
4235       // Don't compress addresses for G16
4236       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4237       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4238                                   PackEndIdx);
4239 
4240       if (!IsA16) {
4241         // Add uncompressed address
4242         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4243           int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4244           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4245           PackedRegs.push_back(AddrReg);
4246         }
4247       }
4248 
4249       // See also below in the non-a16 branch
4250       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4251 
4252       if (!UseNSA && PackedRegs.size() > 1) {
4253         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4254         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4255         PackedRegs[0] = Concat.getReg(0);
4256         PackedRegs.resize(1);
4257       }
4258 
4259       const unsigned NumPacked = PackedRegs.size();
4260       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4261         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4262         if (!SrcOp.isReg()) {
4263           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4264           continue;
4265         }
4266 
4267         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4268 
4269         if (I - Intr->VAddrStart < NumPacked)
4270           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4271         else
4272           SrcOp.setReg(AMDGPU::NoRegister);
4273       }
4274     }
4275   } else {
4276     // If the register allocator cannot place the address registers contiguously
4277     // without introducing moves, then using the non-sequential address encoding
4278     // is always preferable, since it saves VALU instructions and is usually a
4279     // wash in terms of code size or even better.
4280     //
4281     // However, we currently have no way of hinting to the register allocator
4282     // that MIMG addresses should be placed contiguously when it is possible to
4283     // do so, so force non-NSA for the common 2-address case as a heuristic.
4284     //
4285     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4286     // allocation when possible.
4287     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4288 
4289     if (!UseNSA && Intr->NumVAddrs > 1)
4290       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4291                                Intr->NumVAddrs);
4292   }
4293 
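  // Record whether 16-bit addresses (A16) and 16-bit gradients (G16) are in
  // use as an immediate flag operand appended to the instruction: bit 0 for
  // A16, bit 1 for G16.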
4294   int Flags = 0;
4295   if (IsA16)
4296     Flags |= 1;
4297   if (IsG16)
4298     Flags |= 2;
4299   MI.addOperand(MachineOperand::CreateImm(Flags));
4300 
4301   if (BaseOpcode->Store) { // No TFE for stores?
4302     // TODO: Handle dmask trim
4303     Register VData = MI.getOperand(1).getReg();
4304     LLT Ty = MRI->getType(VData);
4305     if (!Ty.isVector() || Ty.getElementType() != S16)
4306       return true;
4307 
4308     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4309     if (RepackedReg != VData) {
4310       MI.getOperand(1).setReg(RepackedReg);
4311     }
4312 
4313     return true;
4314   }
4315 
4316   Register DstReg = MI.getOperand(0).getReg();
4317   LLT Ty = MRI->getType(DstReg);
4318   const LLT EltTy = Ty.getScalarType();
4319   const bool IsD16 = Ty.getScalarType() == S16;
4320   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4321 
4322   // Confirm that the return type is large enough for the dmask specified
4323   if (NumElts < DMaskLanes)
4324     return false;
4325 
4326   if (NumElts > 4 || DMaskLanes > 4)
4327     return false;
4328 
4329   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4330   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4331 
4332   // The raw dword aligned data component of the load. The only legal cases
4333   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4335   LLT RoundedTy;
4336 
  // S32 vector to cover all data, plus the TFE result element.
4338   LLT TFETy;
4339 
4340   // Register type to use for each loaded component. Will be S32 or V2S16.
4341   LLT RegTy;
4342 
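  // With unpacked D16 memory instructions, each 16-bit component is returned
  // in its own 32-bit dword.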
4343   if (IsD16 && ST.hasUnpackedD16VMem()) {
4344     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4345     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4346     RegTy = S32;
4347   } else {
4348     unsigned EltSize = EltTy.getSizeInBits();
4349     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4350     unsigned RoundedSize = 32 * RoundedElts;
4351     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4352     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4353     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4354   }
4355 
4356   // The return type does not need adjustment.
4357   // TODO: Should we change s16 case to s32 or <2 x s16>?
4358   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4359     return true;
4360 
4361   Register Dst1Reg;
4362 
4363   // Insert after the instruction.
4364   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4365 
4366   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4367   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4368   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4369   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4370 
4371   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4372 
4373   MI.getOperand(0).setReg(NewResultReg);
4374 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4379 
4380   if (IsTFE) {
4381     Dst1Reg = MI.getOperand(1).getReg();
4382     if (MRI->getType(Dst1Reg) != S32)
4383       return false;
4384 
4385     // TODO: Make sure the TFE operand bit is set.
4386     MI.RemoveOperand(1);
4387 
4388     // Handle the easy case that requires no repack instructions.
4389     if (Ty == S32) {
4390       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4391       return true;
4392     }
4393   }
4394 
4395   // Now figure out how to copy the new result register back into the old
4396   // result.
4397   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4398 
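  // With TFE, the final dword of the result holds the status value rather
  // than loaded data.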
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4400 
4401   if (ResultNumRegs == 1) {
4402     assert(!IsTFE);
4403     ResultRegs[0] = NewResultReg;
4404   } else {
4405     // We have to repack into a new vector of some kind.
4406     for (int I = 0; I != NumDataRegs; ++I)
4407       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4408     B.buildUnmerge(ResultRegs, NewResultReg);
4409 
4410     // Drop the final TFE element to get the data part. The TFE result is
4411     // directly written to the right place already.
4412     if (IsTFE)
4413       ResultRegs.resize(NumDataRegs);
4414   }
4415 
4416   // For an s16 scalar result, we form an s32 result with a truncate regardless
4417   // of packed vs. unpacked.
4418   if (IsD16 && !Ty.isVector()) {
4419     B.buildTrunc(DstReg, ResultRegs[0]);
4420     return true;
4421   }
4422 
4423   // Avoid a build/concat_vector of 1 entry.
4424   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4425     B.buildBitcast(DstReg, ResultRegs[0]);
4426     return true;
4427   }
4428 
4429   assert(Ty.isVector());
4430 
4431   if (IsD16) {
4432     // For packed D16 results with TFE enabled, all the data components are
4433     // S32. Cast back to the expected type.
4434     //
    // TODO: We don't really need to load into s32 elements. We would only need
    // one cast for the TFE result if a multiple of v2s16 were used.
4437     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4438       for (Register &Reg : ResultRegs)
4439         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4440     } else if (ST.hasUnpackedD16VMem()) {
4441       for (Register &Reg : ResultRegs)
4442         Reg = B.buildTrunc(S16, Reg).getReg(0);
4443     }
4444   }
4445 
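  // Append NumElts undef registers of type Ty to ResultRegs.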
4446   auto padWithUndef = [&](LLT Ty, int NumElts) {
4447     if (NumElts == 0)
4448       return;
4449     Register Undef = B.buildUndef(Ty).getReg(0);
4450     for (int I = 0; I != NumElts; ++I)
4451       ResultRegs.push_back(Undef);
4452   };
4453 
4454   // Pad out any elements eliminated due to the dmask.
4455   LLT ResTy = MRI->getType(ResultRegs[0]);
4456   if (!ResTy.isVector()) {
4457     padWithUndef(ResTy, NumElts - ResultRegs.size());
4458     B.buildBuildVector(DstReg, ResultRegs);
4459     return true;
4460   }
4461 
4462   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4463   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4464 
4465   // Deal with the one annoying legal case.
4466   const LLT V3S16 = LLT::vector(3, 16);
4467   if (Ty == V3S16) {
4468     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4469     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4470     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4471     return true;
4472   }
4473 
4474   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4475   B.buildConcatVectors(DstReg, ResultRegs);
4476   return true;
4477 }
4478 
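// Lower the s_buffer_load intrinsic to G_AMDGPU_S_BUFFER_LOAD: bitcast awkward
// result types, attach the memory operand the intrinsic currently lacks, and
// widen non-power-of-2 result sizes.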
4479 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4480   LegalizerHelper &Helper, MachineInstr &MI) const {
4481   MachineIRBuilder &B = Helper.MIRBuilder;
4482   GISelChangeObserver &Observer = Helper.Observer;
4483 
4484   Register Dst = MI.getOperand(0).getReg();
4485   LLT Ty = B.getMRI()->getType(Dst);
4486   unsigned Size = Ty.getSizeInBits();
4487   MachineFunction &MF = B.getMF();
4488 
4489   Observer.changingInstr(MI);
4490 
4491   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4492     Ty = getBitcastRegisterType(Ty);
4493     Helper.bitcastDst(MI, Ty, 0);
4494     Dst = MI.getOperand(0).getReg();
4495     B.setInsertPt(B.getMBB(), MI);
4496   }
4497 
4498   // FIXME: We don't really need this intermediate instruction. The intrinsic
4499   // should be fixed to have a memory operand. Since it's readnone, we're not
4500   // allowed to add one.
4501   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4502   MI.RemoveOperand(1); // Remove intrinsic ID
4503 
4504   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4505   // TODO: Should this use datalayout alignment?
4506   const unsigned MemSize = (Size + 7) / 8;
4507   const Align MemAlign(4);
4508   MachineMemOperand *MMO = MF.getMachineMemOperand(
4509       MachinePointerInfo(),
4510       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4511           MachineMemOperand::MOInvariant,
4512       MemSize, MemAlign);
4513   MI.addMemOperand(MF, MMO);
4514 
4515   // There are no 96-bit result scalar loads, but widening to 128-bit should
4516   // always be legal. We may need to restore this to a 96-bit result if it turns
4517   // out this needs to be converted to a vector load during RegBankSelect.
4518   if (!isPowerOf2_32(Size)) {
4519     if (Ty.isVector())
4520       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4521     else
4522       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4523   }
4524 
4525   Observer.changedInstr(MI);
4526   return true;
4527 }
4528 
4529 // TODO: Move to selection
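// Pick the trap lowering based on the trap handler ABI: end the program when
// no HSA trap handler is in use, pass the queue pointer for HSA ABI v2/v3 (or
// for v4 without doorbell support), and emit a bare s_trap otherwise.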
4530 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4531                                                 MachineRegisterInfo &MRI,
4532                                                 MachineIRBuilder &B) const {
4533   if (!ST.isTrapHandlerEnabled() ||
4534       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4535     return legalizeTrapEndpgm(MI, MRI, B);
4536 
4537   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4538     switch (*HsaAbiVer) {
4539     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4540     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4541       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4542     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4543       return ST.supportsGetDoorbellID() ?
4544           legalizeTrapHsa(MI, MRI, B) :
4545           legalizeTrapHsaQueuePtr(MI, MRI, B);
4546     }
4547   }
4548 
4549   llvm_unreachable("Unknown trap handler");
4550 }
4551 
4552 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4553     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4554   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4555   MI.eraseFromParent();
4556   return true;
4557 }
4558 
4559 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4560     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4561   // Pass queue pointer to trap handler as input, and insert trap instruction
4562   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4563   Register LiveIn =
4564     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4565   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4566     return false;
4567 
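  // The trap handler ABI expects the queue pointer in SGPR0/SGPR1, passed as
  // an implicit use of s_trap.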
4568   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4569   B.buildCopy(SGPR01, LiveIn);
4570   B.buildInstr(AMDGPU::S_TRAP)
4571       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4572       .addReg(SGPR01, RegState::Implicit);
4573 
4574   MI.eraseFromParent();
4575   return true;
4576 }
4577 
4578 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4579     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4580   B.buildInstr(AMDGPU::S_TRAP)
4581       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4582   MI.eraseFromParent();
4583   return true;
4584 }
4585 
4586 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4587     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning.
4590   if (!ST.isTrapHandlerEnabled() ||
4591       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4592     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4593                                      "debugtrap handler not supported",
4594                                      MI.getDebugLoc(), DS_Warning);
4595     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4596     Ctx.diagnose(NoTrap);
4597   } else {
4598     // Insert debug-trap instruction
4599     B.buildInstr(AMDGPU::S_TRAP)
4600         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4601   }
4602 
4603   MI.eraseFromParent();
4604   return true;
4605 }
4606 
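// Lower llvm.amdgcn.image.bvh.intersect.ray to
// G_AMDGPU_INTRIN_BVH_INTERSECT_RAY. The vector ray operands are flattened
// into scalar dword registers, packing pairs of 16-bit components when A16 is
// in use, and the opcode is chosen by the node pointer width.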
4607 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4608                                                MachineIRBuilder &B) const {
4609   MachineRegisterInfo &MRI = *B.getMRI();
4610   const LLT S16 = LLT::scalar(16);
4611   const LLT S32 = LLT::scalar(32);
4612 
4613   Register DstReg = MI.getOperand(0).getReg();
4614   Register NodePtr = MI.getOperand(2).getReg();
4615   Register RayExtent = MI.getOperand(3).getReg();
4616   Register RayOrigin = MI.getOperand(4).getReg();
4617   Register RayDir = MI.getOperand(5).getReg();
4618   Register RayInvDir = MI.getOperand(6).getReg();
4619   Register TDescr = MI.getOperand(7).getReg();
4620 
4621   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4623   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4624                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4625                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4626                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4627 
4628   SmallVector<Register, 12> Ops;
4629   if (Is64) {
4630     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4631     Ops.push_back(Unmerge.getReg(0));
4632     Ops.push_back(Unmerge.getReg(1));
4633   } else {
4634     Ops.push_back(NodePtr);
4635   }
4636   Ops.push_back(RayExtent);
4637 
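  // Unmerge a four-element vector operand into 32-bit lanes and push the first
  // three; the fourth lane is dropped.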
4638   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4639     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4640     Ops.push_back(Unmerge.getReg(0));
4641     Ops.push_back(Unmerge.getReg(1));
4642     Ops.push_back(Unmerge.getReg(2));
4643   };
4644 
4645   packLanes(RayOrigin);
4646   if (IsA16) {
4647     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4648     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4649     Register R1 = MRI.createGenericVirtualRegister(S32);
4650     Register R2 = MRI.createGenericVirtualRegister(S32);
4651     Register R3 = MRI.createGenericVirtualRegister(S32);
4652     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4653     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4654     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4655     Ops.push_back(R1);
4656     Ops.push_back(R2);
4657     Ops.push_back(R3);
4658   } else {
4659     packLanes(RayDir);
4660     packLanes(RayInvDir);
4661   }
4662 
4663   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4664     .addDef(DstReg)
4665     .addImm(Opcode);
4666 
4667   for (Register R : Ops) {
4668     MIB.addUse(R);
4669   }
4670 
4671   MIB.addUse(TDescr)
4672      .addImm(IsA16 ? 1 : 0)
4673      .cloneMemRefs(MI);
4674 
4675   MI.eraseFromParent();
4676   return true;
4677 }
4678 
4679 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4680                                             MachineInstr &MI) const {
4681   MachineIRBuilder &B = Helper.MIRBuilder;
4682   MachineRegisterInfo &MRI = *B.getMRI();
4683 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4685   auto IntrID = MI.getIntrinsicID();
4686   switch (IntrID) {
4687   case Intrinsic::amdgcn_if:
4688   case Intrinsic::amdgcn_else: {
4689     MachineInstr *Br = nullptr;
4690     MachineBasicBlock *UncondBrTarget = nullptr;
4691     bool Negated = false;
4692     if (MachineInstr *BrCond =
4693             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4694       const SIRegisterInfo *TRI
4695         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4696 
4697       Register Def = MI.getOperand(1).getReg();
4698       Register Use = MI.getOperand(3).getReg();
4699 
4700       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4701 
4702       if (Negated)
4703         std::swap(CondBrTarget, UncondBrTarget);
4704 
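      // Emit SI_IF/SI_ELSE at the position of the G_BRCOND; the pseudo carries
      // the branch target that was previously the unconditional successor.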
4705       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4706       if (IntrID == Intrinsic::amdgcn_if) {
4707         B.buildInstr(AMDGPU::SI_IF)
4708           .addDef(Def)
4709           .addUse(Use)
4710           .addMBB(UncondBrTarget);
4711       } else {
4712         B.buildInstr(AMDGPU::SI_ELSE)
4713             .addDef(Def)
4714             .addUse(Use)
4715             .addMBB(UncondBrTarget);
4716       }
4717 
4718       if (Br) {
4719         Br->getOperand(0).setMBB(CondBrTarget);
4720       } else {
4721         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4722         // since we're swapping branch targets it needs to be reinserted.
4723         // FIXME: IRTranslator should probably not do this
4724         B.buildBr(*CondBrTarget);
4725       }
4726 
4727       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4728       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4729       MI.eraseFromParent();
4730       BrCond->eraseFromParent();
4731       return true;
4732     }
4733 
4734     return false;
4735   }
4736   case Intrinsic::amdgcn_loop: {
4737     MachineInstr *Br = nullptr;
4738     MachineBasicBlock *UncondBrTarget = nullptr;
4739     bool Negated = false;
4740     if (MachineInstr *BrCond =
4741             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4742       const SIRegisterInfo *TRI
4743         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4744 
4745       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4746       Register Reg = MI.getOperand(2).getReg();
4747 
4748       if (Negated)
4749         std::swap(CondBrTarget, UncondBrTarget);
4750 
4751       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4752       B.buildInstr(AMDGPU::SI_LOOP)
4753         .addUse(Reg)
4754         .addMBB(UncondBrTarget);
4755 
4756       if (Br)
4757         Br->getOperand(0).setMBB(CondBrTarget);
4758       else
4759         B.buildBr(*CondBrTarget);
4760 
4761       MI.eraseFromParent();
4762       BrCond->eraseFromParent();
4763       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4764       return true;
4765     }
4766 
4767     return false;
4768   }
4769   case Intrinsic::amdgcn_kernarg_segment_ptr:
4770     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4771       // This only makes sense to call in a kernel, so just lower to null.
4772       B.buildConstant(MI.getOperand(0).getReg(), 0);
4773       MI.eraseFromParent();
4774       return true;
4775     }
4776 
4777     return legalizePreloadedArgIntrin(
4778       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4779   case Intrinsic::amdgcn_implicitarg_ptr:
4780     return legalizeImplicitArgPtr(MI, MRI, B);
4781   case Intrinsic::amdgcn_workitem_id_x:
4782     return legalizePreloadedArgIntrin(MI, MRI, B,
4783                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4784   case Intrinsic::amdgcn_workitem_id_y:
4785     return legalizePreloadedArgIntrin(MI, MRI, B,
4786                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4787   case Intrinsic::amdgcn_workitem_id_z:
4788     return legalizePreloadedArgIntrin(MI, MRI, B,
4789                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4790   case Intrinsic::amdgcn_workgroup_id_x:
4791     return legalizePreloadedArgIntrin(MI, MRI, B,
4792                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4793   case Intrinsic::amdgcn_workgroup_id_y:
4794     return legalizePreloadedArgIntrin(MI, MRI, B,
4795                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4796   case Intrinsic::amdgcn_workgroup_id_z:
4797     return legalizePreloadedArgIntrin(MI, MRI, B,
4798                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4799   case Intrinsic::amdgcn_dispatch_ptr:
4800     return legalizePreloadedArgIntrin(MI, MRI, B,
4801                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4802   case Intrinsic::amdgcn_queue_ptr:
4803     return legalizePreloadedArgIntrin(MI, MRI, B,
4804                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4805   case Intrinsic::amdgcn_implicit_buffer_ptr:
4806     return legalizePreloadedArgIntrin(
4807       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4808   case Intrinsic::amdgcn_dispatch_id:
4809     return legalizePreloadedArgIntrin(MI, MRI, B,
4810                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4811   case Intrinsic::amdgcn_fdiv_fast:
4812     return legalizeFDIVFastIntrin(MI, MRI, B);
4813   case Intrinsic::amdgcn_is_shared:
4814     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4815   case Intrinsic::amdgcn_is_private:
4816     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4817   case Intrinsic::amdgcn_wavefrontsize: {
4818     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4819     MI.eraseFromParent();
4820     return true;
4821   }
4822   case Intrinsic::amdgcn_s_buffer_load:
4823     return legalizeSBufferLoad(Helper, MI);
4824   case Intrinsic::amdgcn_raw_buffer_store:
4825   case Intrinsic::amdgcn_struct_buffer_store:
4826     return legalizeBufferStore(MI, MRI, B, false, false);
4827   case Intrinsic::amdgcn_raw_buffer_store_format:
4828   case Intrinsic::amdgcn_struct_buffer_store_format:
4829     return legalizeBufferStore(MI, MRI, B, false, true);
4830   case Intrinsic::amdgcn_raw_tbuffer_store:
4831   case Intrinsic::amdgcn_struct_tbuffer_store:
4832     return legalizeBufferStore(MI, MRI, B, true, true);
4833   case Intrinsic::amdgcn_raw_buffer_load:
4834   case Intrinsic::amdgcn_struct_buffer_load:
4835     return legalizeBufferLoad(MI, MRI, B, false, false);
4836   case Intrinsic::amdgcn_raw_buffer_load_format:
4837   case Intrinsic::amdgcn_struct_buffer_load_format:
4838     return legalizeBufferLoad(MI, MRI, B, true, false);
4839   case Intrinsic::amdgcn_raw_tbuffer_load:
4840   case Intrinsic::amdgcn_struct_tbuffer_load:
4841     return legalizeBufferLoad(MI, MRI, B, true, true);
4842   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4843   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4844   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4845   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4846   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4847   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4848   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4849   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4850   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4851   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4852   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4853   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4854   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4855   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4856   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4857   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4858   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4859   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4860   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4861   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4862   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4863   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4864   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4865   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4866   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4867   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4868   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4869   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4870   case Intrinsic::amdgcn_buffer_atomic_fadd:
4871   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4872   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4873   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4874   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4875     return legalizeBufferAtomic(MI, B, IntrID);
4876   case Intrinsic::amdgcn_atomic_inc:
4877     return legalizeAtomicIncDec(MI, B, true);
4878   case Intrinsic::amdgcn_atomic_dec:
4879     return legalizeAtomicIncDec(MI, B, false);
4880   case Intrinsic::trap:
4881     return legalizeTrapIntrinsic(MI, MRI, B);
4882   case Intrinsic::debugtrap:
4883     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4884   case Intrinsic::amdgcn_rsq_clamp:
4885     return legalizeRsqClampIntrinsic(MI, MRI, B);
4886   case Intrinsic::amdgcn_ds_fadd:
4887   case Intrinsic::amdgcn_ds_fmin:
4888   case Intrinsic::amdgcn_ds_fmax:
4889     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4890   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4891     return legalizeBVHIntrinsic(MI, B);
4892   default: {
4893     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4894             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4895       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4896     return true;
4897   }
4898   }
4899 
4900   return true;
4901 }
4902