1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
55 
// Round the scalar size up to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
62 
/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
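// Return true if the total size of the type is a multiple of 32 bits.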
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
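// Return true for vectors of 16-bit elements with more than two elements,
// i.e. anything wider than v2s16.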
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
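// Reduce the number of vector elements so that each resulting piece is
// roughly 64 bits wide.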
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
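// Pick a register-like type to bitcast an awkwardly typed value to: a single
// scalar for sizes up to 32 bits, otherwise a scalar or vector of 32-bit
// elements.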
132 static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
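// True if a value of this size fills a whole number of 32-bit registers, up
// to the maximum register size.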
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
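// True for element types that pack cleanly into 32-bit registers: 16-bit
// elements or elements that are a multiple of 32 bits.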
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
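// True for vector types whose elements map onto the register file: 32-bit or
// 64-bit elements, an even number of 16-bit elements, or 128/256-bit elements.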
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
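// True for vectors whose element type is s16 or at least 32 bits wide.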
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
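// Match a truncating store of a scalar wider than 32 bits.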
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
248     // Treat constant and global as identical. SMRD loads are sometimes usable for
249     // global loads (ideally constant address space should be eliminated)
250     // depending on the context. Legality cannot be context dependent, but
251     // RegBankSelect can split the load as necessary depending on the pointer
252     // register bank/uniformity and if the memory is invariant or not written in a
253     // kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
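// Check whether a load or store with this register type, memory size and
// alignment can be handled directly in the given address space.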
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
341 
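// A load or store is kept legal only if the result is a register type, the
// size and alignment are supported, and no bitcast workaround is required.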
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348 
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
356 
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
362 
363 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
364                                          const GCNTargetMachine &TM)
365   :  ST(ST_) {
366   using namespace TargetOpcode;
367 
368   auto GetAddrSpacePtr = [&TM](unsigned AS) {
369     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
370   };
371 
372   const LLT S1 = LLT::scalar(1);
373   const LLT S16 = LLT::scalar(16);
374   const LLT S32 = LLT::scalar(32);
375   const LLT S64 = LLT::scalar(64);
376   const LLT S128 = LLT::scalar(128);
377   const LLT S256 = LLT::scalar(256);
378   const LLT S512 = LLT::scalar(512);
379   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
380 
381   const LLT V2S16 = LLT::vector(2, 16);
382   const LLT V4S16 = LLT::vector(4, 16);
383 
384   const LLT V2S32 = LLT::vector(2, 32);
385   const LLT V3S32 = LLT::vector(3, 32);
386   const LLT V4S32 = LLT::vector(4, 32);
387   const LLT V5S32 = LLT::vector(5, 32);
388   const LLT V6S32 = LLT::vector(6, 32);
389   const LLT V7S32 = LLT::vector(7, 32);
390   const LLT V8S32 = LLT::vector(8, 32);
391   const LLT V9S32 = LLT::vector(9, 32);
392   const LLT V10S32 = LLT::vector(10, 32);
393   const LLT V11S32 = LLT::vector(11, 32);
394   const LLT V12S32 = LLT::vector(12, 32);
395   const LLT V13S32 = LLT::vector(13, 32);
396   const LLT V14S32 = LLT::vector(14, 32);
397   const LLT V15S32 = LLT::vector(15, 32);
398   const LLT V16S32 = LLT::vector(16, 32);
399   const LLT V32S32 = LLT::vector(32, 32);
400 
401   const LLT V2S64 = LLT::vector(2, 64);
402   const LLT V3S64 = LLT::vector(3, 64);
403   const LLT V4S64 = LLT::vector(4, 64);
404   const LLT V5S64 = LLT::vector(5, 64);
405   const LLT V6S64 = LLT::vector(6, 64);
406   const LLT V7S64 = LLT::vector(7, 64);
407   const LLT V8S64 = LLT::vector(8, 64);
408   const LLT V16S64 = LLT::vector(16, 64);
409 
410   std::initializer_list<LLT> AllS32Vectors =
411     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
412      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
413   std::initializer_list<LLT> AllS64Vectors =
414     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
415 
416   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
417   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
418   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
419   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
420   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
421   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
422   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
423 
424   const LLT CodePtr = FlatPtr;
425 
426   const std::initializer_list<LLT> AddrSpaces64 = {
427     GlobalPtr, ConstantPtr, FlatPtr
428   };
429 
430   const std::initializer_list<LLT> AddrSpaces32 = {
431     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
432   };
433 
434   const std::initializer_list<LLT> FPTypesBase = {
435     S32, S64
436   };
437 
438   const std::initializer_list<LLT> FPTypes16 = {
439     S32, S64, S16
440   };
441 
442   const std::initializer_list<LLT> FPTypesPK16 = {
443     S32, S64, S16, V2S16
444   };
445 
446   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
447 
448   setAction({G_BRCOND, S1}, Legal); // VCC branches
449   setAction({G_BRCOND, S32}, Legal); // SCC branches
450 
451   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
452   // elements for v3s16
453   getActionDefinitionsBuilder(G_PHI)
454     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
455     .legalFor(AllS32Vectors)
456     .legalFor(AllS64Vectors)
457     .legalFor(AddrSpaces64)
458     .legalFor(AddrSpaces32)
459     .legalIf(isPointer(0))
460     .clampScalar(0, S16, S256)
461     .widenScalarToNextPow2(0, 32)
462     .clampMaxNumElements(0, S32, 16)
463     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
464     .scalarize(0);
465 
466   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
467     // Full set of gfx9 features.
468     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
469       .legalFor({S32, S16, V2S16})
470       .clampScalar(0, S16, S32)
471       .clampMaxNumElements(0, S16, 2)
472       .scalarize(0)
473       .widenScalarToNextPow2(0, 32);
474 
475     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
476       .legalFor({S32, S16, V2S16}) // Clamp modifier
477       .minScalarOrElt(0, S16)
478       .clampMaxNumElements(0, S16, 2)
479       .scalarize(0)
480       .widenScalarToNextPow2(0, 32)
481       .lower();
482   } else if (ST.has16BitInsts()) {
483     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
484       .legalFor({S32, S16})
485       .clampScalar(0, S16, S32)
486       .scalarize(0)
487       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
488 
489     // Technically the saturating operations require clamp bit support, but this
490     // was introduced at the same time as 16-bit operations.
491     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
492       .legalFor({S32, S16}) // Clamp modifier
493       .minScalar(0, S16)
494       .scalarize(0)
495       .widenScalarToNextPow2(0, 16)
496       .lower();
497 
498     // We're just lowering this, but it helps get a better result to try to
499     // coerce to the desired type first.
500     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
501       .minScalar(0, S16)
502       .scalarize(0)
503       .lower();
504   } else {
505     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
506       .legalFor({S32})
507       .clampScalar(0, S32, S32)
508       .scalarize(0);
509 
510     if (ST.hasIntClamp()) {
511       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
512         .legalFor({S32}) // Clamp modifier.
513         .scalarize(0)
514         .minScalarOrElt(0, S32)
515         .lower();
516     } else {
517       // Clamp bit support was added in VI, along with 16-bit operations.
518       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
519         .minScalar(0, S32)
520         .scalarize(0)
521         .lower();
522     }
523 
524     // FIXME: DAG expansion gets better results. The widening uses the smaller
525     // range values and goes for the min/max lowering directly.
526     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
527       .minScalar(0, S32)
528       .scalarize(0)
529       .lower();
530   }
531 
532   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
533     .customFor({S32, S64})
534     .clampScalar(0, S32, S64)
535     .widenScalarToNextPow2(0, 32)
536     .scalarize(0);
537 
538   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
539     .legalFor({S32})
540     .clampScalar(0, S32, S32)
541     .scalarize(0);
542 
543   // Report legal for any types we can handle anywhere. For the cases only legal
544   // on the SALU, RegBankSelect will be able to re-legalize.
545   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
546     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
547     .clampScalar(0, S32, S64)
548     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
549     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
550     .widenScalarToNextPow2(0)
551     .scalarize(0);
552 
553   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
554                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
555     .legalFor({{S32, S1}, {S32, S32}})
556     .minScalar(0, S32)
557     // TODO: .scalarize(0)
558     .lower();
559 
560   getActionDefinitionsBuilder(G_BITCAST)
561     // Don't worry about the size constraint.
562     .legalIf(all(isRegisterType(0), isRegisterType(1)))
563     .lower();
564 
565 
566   getActionDefinitionsBuilder(G_CONSTANT)
567     .legalFor({S1, S32, S64, S16, GlobalPtr,
568                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
569     .legalIf(isPointer(0))
570     .clampScalar(0, S32, S64)
571     .widenScalarToNextPow2(0);
572 
573   getActionDefinitionsBuilder(G_FCONSTANT)
574     .legalFor({S32, S64, S16})
575     .clampScalar(0, S16, S64);
576 
577   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
578       .legalIf(isRegisterType(0))
579       // s1 and s16 are special cases because they have legal operations on
580       // them, but don't really occupy registers in the normal way.
581       .legalFor({S1, S16})
582       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
583       .clampScalarOrElt(0, S32, MaxScalar)
584       .widenScalarToNextPow2(0, 32)
585       .clampMaxNumElements(0, S32, 16);
586 
587   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
588 
589   // If the amount is divergent, we have to do a wave reduction to get the
590   // maximum value, so this is expanded during RegBankSelect.
591   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
592     .legalFor({{PrivatePtr, S32}});
593 
594   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
595     .customIf(typeIsNot(0, PrivatePtr));
596 
597   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
598 
599   auto &FPOpActions = getActionDefinitionsBuilder(
600     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
601     .legalFor({S32, S64});
602   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
603     .customFor({S32, S64});
604   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
605     .customFor({S32, S64});
606 
607   if (ST.has16BitInsts()) {
608     if (ST.hasVOP3PInsts())
609       FPOpActions.legalFor({S16, V2S16});
610     else
611       FPOpActions.legalFor({S16});
612 
613     TrigActions.customFor({S16});
614     FDIVActions.customFor({S16});
615   }
616 
617   auto &MinNumMaxNum = getActionDefinitionsBuilder({
618       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
619 
620   if (ST.hasVOP3PInsts()) {
621     MinNumMaxNum.customFor(FPTypesPK16)
622       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
623       .clampMaxNumElements(0, S16, 2)
624       .clampScalar(0, S16, S64)
625       .scalarize(0);
626   } else if (ST.has16BitInsts()) {
627     MinNumMaxNum.customFor(FPTypes16)
628       .clampScalar(0, S16, S64)
629       .scalarize(0);
630   } else {
631     MinNumMaxNum.customFor(FPTypesBase)
632       .clampScalar(0, S32, S64)
633       .scalarize(0);
634   }
635 
636   if (ST.hasVOP3PInsts())
637     FPOpActions.clampMaxNumElements(0, S16, 2);
638 
639   FPOpActions
640     .scalarize(0)
641     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
642 
643   TrigActions
644     .scalarize(0)
645     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
646 
647   FDIVActions
648     .scalarize(0)
649     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
650 
651   getActionDefinitionsBuilder({G_FNEG, G_FABS})
652     .legalFor(FPTypesPK16)
653     .clampMaxNumElements(0, S16, 2)
654     .scalarize(0)
655     .clampScalar(0, S16, S64);
656 
657   if (ST.has16BitInsts()) {
658     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
659       .legalFor({S32, S64, S16})
660       .scalarize(0)
661       .clampScalar(0, S16, S64);
662   } else {
663     getActionDefinitionsBuilder(G_FSQRT)
664       .legalFor({S32, S64})
665       .scalarize(0)
666       .clampScalar(0, S32, S64);
667 
668     if (ST.hasFractBug()) {
669       getActionDefinitionsBuilder(G_FFLOOR)
670         .customFor({S64})
671         .legalFor({S32, S64})
672         .scalarize(0)
673         .clampScalar(0, S32, S64);
674     } else {
675       getActionDefinitionsBuilder(G_FFLOOR)
676         .legalFor({S32, S64})
677         .scalarize(0)
678         .clampScalar(0, S32, S64);
679     }
680   }
681 
682   getActionDefinitionsBuilder(G_FPTRUNC)
683     .legalFor({{S32, S64}, {S16, S32}})
684     .scalarize(0)
685     .lower();
686 
687   getActionDefinitionsBuilder(G_FPEXT)
688     .legalFor({{S64, S32}, {S32, S16}})
689     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
690     .scalarize(0);
691 
692   getActionDefinitionsBuilder(G_FSUB)
693       // Use actual fsub instruction
694       .legalFor({S32})
695       // Must use fadd + fneg
696       .lowerFor({S64, S16, V2S16})
697       .scalarize(0)
698       .clampScalar(0, S32, S64);
699 
700   // Whether this is legal depends on the floating point mode for the function.
701   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
702   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
703     FMad.customFor({S32, S16});
704   else if (ST.hasMadMacF32Insts())
705     FMad.customFor({S32});
706   else if (ST.hasMadF16())
707     FMad.customFor({S16});
708   FMad.scalarize(0)
709       .lower();
710 
711   // TODO: Do we need to clamp maximum bitwidth?
712   getActionDefinitionsBuilder(G_TRUNC)
713     .legalIf(isScalar(0))
714     .legalFor({{V2S16, V2S32}})
715     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
719     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
720     .alwaysLegal();
721 
722   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
723     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
724                {S32, S1}, {S64, S1}, {S16, S1}})
725     .scalarize(0)
726     .clampScalar(0, S32, S64)
727     .widenScalarToNextPow2(1, 32);
728 
729   // TODO: Split s1->s64 during regbankselect for VALU.
730   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
731     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
732     .lowerFor({{S32, S64}})
733     .lowerIf(typeIs(1, S1))
734     .customFor({{S64, S64}});
735   if (ST.has16BitInsts())
736     IToFP.legalFor({{S16, S16}});
737   IToFP.clampScalar(1, S32, S64)
738        .minScalar(0, S32)
739        .scalarize(0)
740        .widenScalarToNextPow2(1);
741 
742   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
743     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
744     .customFor({{S64, S64}})
745     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
746   if (ST.has16BitInsts())
747     FPToI.legalFor({{S16, S16}});
748   else
749     FPToI.minScalar(1, S32);
750 
751   FPToI.minScalar(0, S32)
752        .scalarize(0)
753        .lower();
754 
755   // Lower roundeven into G_FRINT
756   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
757     .scalarize(0)
758     .lower();
759 
760   if (ST.has16BitInsts()) {
761     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
762       .legalFor({S16, S32, S64})
763       .clampScalar(0, S16, S64)
764       .scalarize(0);
765   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
766     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
767       .legalFor({S32, S64})
768       .clampScalar(0, S32, S64)
769       .scalarize(0);
770   } else {
771     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
772       .legalFor({S32})
773       .customFor({S64})
774       .clampScalar(0, S32, S64)
775       .scalarize(0);
776   }
777 
778   getActionDefinitionsBuilder(G_PTR_ADD)
779     .legalIf(all(isPointer(0), sameSize(0, 1)))
780     .scalarize(0)
781     .scalarSameSizeAs(1, 0);
782 
783   getActionDefinitionsBuilder(G_PTRMASK)
784     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
785     .scalarSameSizeAs(1, 0)
786     .scalarize(0);
787 
788   auto &CmpBuilder =
789     getActionDefinitionsBuilder(G_ICMP)
790     // The compare output type differs based on the register bank of the output,
791     // so make both s1 and s32 legal.
792     //
793     // Scalar compares producing output in scc will be promoted to s32, as that
794     // is the allocatable register type that will be needed for the copy from
795     // scc. This will be promoted during RegBankSelect, and we assume something
796     // before that won't try to use s32 result types.
797     //
798     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
799     // bank.
800     .legalForCartesianProduct(
801       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
802     .legalForCartesianProduct(
803       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
804   if (ST.has16BitInsts()) {
805     CmpBuilder.legalFor({{S1, S16}});
806   }
807 
808   CmpBuilder
809     .widenScalarToNextPow2(1)
810     .clampScalar(1, S32, S64)
811     .scalarize(0)
812     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
813 
814   getActionDefinitionsBuilder(G_FCMP)
815     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
816     .widenScalarToNextPow2(1)
817     .clampScalar(1, S32, S64)
818     .scalarize(0);
819 
820   // FIXME: fpow has a selection pattern that should move to custom lowering.
821   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
822   if (ST.has16BitInsts())
823     Exp2Ops.legalFor({S32, S16});
824   else
825     Exp2Ops.legalFor({S32});
826   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
827   Exp2Ops.scalarize(0);
828 
829   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
830   if (ST.has16BitInsts())
831     ExpOps.customFor({{S32}, {S16}});
832   else
833     ExpOps.customFor({S32});
834   ExpOps.clampScalar(0, MinScalarFPTy, S32)
835         .scalarize(0);
836 
837   getActionDefinitionsBuilder(G_FPOWI)
838     .clampScalar(0, MinScalarFPTy, S32)
839     .lower();
840 
841   // The 64-bit versions produce 32-bit results, but only on the SALU.
842   getActionDefinitionsBuilder(G_CTPOP)
843     .legalFor({{S32, S32}, {S32, S64}})
844     .clampScalar(0, S32, S32)
845     .clampScalar(1, S32, S64)
846     .scalarize(0)
847     .widenScalarToNextPow2(0, 32)
848     .widenScalarToNextPow2(1, 32);
849 
850   // The hardware instructions return a different result on 0 than the generic
851   // instructions expect. The hardware produces -1, but these produce the
852   // bitwidth.
853   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
854     .scalarize(0)
855     .clampScalar(0, S32, S32)
856     .clampScalar(1, S32, S64)
857     .widenScalarToNextPow2(0, 32)
858     .widenScalarToNextPow2(1, 32)
859     .lower();
860 
861   // The 64-bit versions produce 32-bit results, but only on the SALU.
862   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
863     .legalFor({{S32, S32}, {S32, S64}})
864     .clampScalar(0, S32, S32)
865     .clampScalar(1, S32, S64)
866     .scalarize(0)
867     .widenScalarToNextPow2(0, 32)
868     .widenScalarToNextPow2(1, 32);
869 
870   getActionDefinitionsBuilder(G_BITREVERSE)
871     .legalFor({S32})
872     .clampScalar(0, S32, S32)
873     .scalarize(0);
874 
875   if (ST.has16BitInsts()) {
876     getActionDefinitionsBuilder(G_BSWAP)
877       .legalFor({S16, S32, V2S16})
878       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
881       .widenScalarToNextPow2(0)
882       .clampScalar(0, S16, S32)
883       .scalarize(0);
884 
885     if (ST.hasVOP3PInsts()) {
886       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
887         .legalFor({S32, S16, V2S16})
888         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
889         .clampMaxNumElements(0, S16, 2)
890         .minScalar(0, S16)
891         .widenScalarToNextPow2(0)
892         .scalarize(0)
893         .lower();
894     } else {
895       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
896         .legalFor({S32, S16})
897         .widenScalarToNextPow2(0)
898         .minScalar(0, S16)
899         .scalarize(0)
900         .lower();
901     }
902   } else {
903     // TODO: Should have same legality without v_perm_b32
904     getActionDefinitionsBuilder(G_BSWAP)
905       .legalFor({S32})
906       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
909       .widenScalarToNextPow2(0)
910       .maxScalar(0, S32)
911       .scalarize(0)
912       .lower();
913 
914     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
915       .legalFor({S32})
916       .minScalar(0, S32)
917       .widenScalarToNextPow2(0)
918       .scalarize(0)
919       .lower();
920   }
921 
922   getActionDefinitionsBuilder(G_INTTOPTR)
923     // List the common cases
924     .legalForCartesianProduct(AddrSpaces64, {S64})
925     .legalForCartesianProduct(AddrSpaces32, {S32})
926     .scalarize(0)
927     // Accept any address space as long as the size matches
928     .legalIf(sameSize(0, 1))
929     .widenScalarIf(smallerThan(1, 0),
930       [](const LegalityQuery &Query) {
931         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
932       })
933     .narrowScalarIf(largerThan(1, 0),
934       [](const LegalityQuery &Query) {
935         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
936       });
937 
938   getActionDefinitionsBuilder(G_PTRTOINT)
939     // List the common cases
940     .legalForCartesianProduct(AddrSpaces64, {S64})
941     .legalForCartesianProduct(AddrSpaces32, {S32})
942     .scalarize(0)
943     // Accept any address space as long as the size matches
944     .legalIf(sameSize(0, 1))
945     .widenScalarIf(smallerThan(0, 1),
946       [](const LegalityQuery &Query) {
947         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
948       })
949     .narrowScalarIf(
950       largerThan(0, 1),
951       [](const LegalityQuery &Query) {
952         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
953       });
954 
955   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
956     .scalarize(0)
957     .custom();
958 
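  // Return true if a memory access must be split into multiple operations,
  // either because it is too wide for the address space or because it cannot
  // be performed as a single, sufficiently aligned access.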
959   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
960                                     bool IsLoad) -> bool {
961     const LLT DstTy = Query.Types[0];
962 
963     // Split vector extloads.
964     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
965     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
966 
967     if (MemSize < DstTy.getSizeInBits())
968       MemSize = std::max(MemSize, AlignBits);
969 
970     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
971       return true;
972 
973     const LLT PtrTy = Query.Types[1];
974     unsigned AS = PtrTy.getAddressSpace();
975     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
976       return true;
977 
978     // Catch weird sized loads that don't evenly divide into the access sizes
979     // TODO: May be able to widen depending on alignment etc.
980     unsigned NumRegs = (MemSize + 31) / 32;
981     if (NumRegs == 3) {
982       if (!ST.hasDwordx3LoadStores())
983         return true;
984     } else {
985       // If the alignment allows, these should have been widened.
986       if (!isPowerOf2_32(NumRegs))
987         return true;
988     }
989 
990     if (AlignBits < MemSize) {
991       const SITargetLowering *TLI = ST.getTargetLowering();
992       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
993                                                       Align(AlignBits / 8));
994     }
995 
996     return false;
997   };
998 
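  // Return true if an oddly sized load should be widened to the next power of
  // two. This is only allowed when the result still fits the address space and
  // the alignment covers the widened size.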
999   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
1000                                          unsigned Opc) -> bool {
1001     unsigned Size = Query.Types[0].getSizeInBits();
1002     if (isPowerOf2_32(Size))
1003       return false;
1004 
1005     if (Size == 96 && ST.hasDwordx3LoadStores())
1006       return false;
1007 
1008     unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc == G_LOAD))
1010       return false;
1011 
1012     unsigned Align = Query.MMODescrs[0].AlignInBits;
1013     unsigned RoundedSize = NextPowerOf2(Size);
1014     return (Align >= RoundedSize);
1015   };
1016 
1017   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
1018   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
1019   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
1020 
1021   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1022   // LDS
1023   // TODO: Unsupported flat for SI.
1024 
1025   for (unsigned Op : {G_LOAD, G_STORE}) {
1026     const bool IsStore = Op == G_STORE;
1027 
1028     auto &Actions = getActionDefinitionsBuilder(Op);
1029     // Explicitly list some common cases.
1030     // TODO: Does this help compile time at all?
1031     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1032                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1033                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1034                                       {S64, GlobalPtr, 64, GlobalAlign32},
1035                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1036                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1037                                       {S32, GlobalPtr, 8, GlobalAlign8},
1038                                       {S32, GlobalPtr, 16, GlobalAlign16},
1039 
1040                                       {S32, LocalPtr, 32, 32},
1041                                       {S64, LocalPtr, 64, 32},
1042                                       {V2S32, LocalPtr, 64, 32},
1043                                       {S32, LocalPtr, 8, 8},
1044                                       {S32, LocalPtr, 16, 16},
1045                                       {V2S16, LocalPtr, 32, 32},
1046 
1047                                       {S32, PrivatePtr, 32, 32},
1048                                       {S32, PrivatePtr, 8, 8},
1049                                       {S32, PrivatePtr, 16, 16},
1050                                       {V2S16, PrivatePtr, 32, 32},
1051 
1052                                       {S32, ConstantPtr, 32, GlobalAlign32},
1053                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1054                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1055                                       {S64, ConstantPtr, 64, GlobalAlign32},
1056                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1057     Actions.legalIf(
1058       [=](const LegalityQuery &Query) -> bool {
1059         return isLoadStoreLegal(ST, Query, Op);
1060       });
1061 
1062     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1063     // 64-bits.
1064     //
1065     // TODO: Should generalize bitcast action into coerce, which will also cover
1066     // inserting addrspacecasts.
1067     Actions.customIf(typeIs(1, Constant32Ptr));
1068 
1069     // Turn any illegal element vectors into something easier to deal
1070     // with. These will ultimately produce 32-bit scalar shifts to extract the
1071     // parts anyway.
1072     //
1073     // For odd 16-bit element vectors, prefer to split those into pieces with
1074     // 16-bit vector parts.
1075     Actions.bitcastIf(
1076       [=](const LegalityQuery &Query) -> bool {
1077         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1078                                           Query.MMODescrs[0].SizeInBits);
1079       }, bitcastToRegisterType(0));
1080 
1081     Actions
1082         .customIf(typeIs(1, Constant32Ptr))
1083         // Widen suitably aligned loads by loading extra elements.
1084         .moreElementsIf([=](const LegalityQuery &Query) {
1085             const LLT Ty = Query.Types[0];
1086             return Op == G_LOAD && Ty.isVector() &&
1087                    shouldWidenLoadResult(Query, Op);
1088           }, moreElementsToNextPow2(0))
1089         .widenScalarIf([=](const LegalityQuery &Query) {
1090             const LLT Ty = Query.Types[0];
1091             return Op == G_LOAD && !Ty.isVector() &&
1092                    shouldWidenLoadResult(Query, Op);
1093           }, widenScalarOrEltToNextPow2(0))
1094         .narrowScalarIf(
1095             [=](const LegalityQuery &Query) -> bool {
1096               return !Query.Types[0].isVector() &&
1097                      needToSplitMemOp(Query, Op == G_LOAD);
1098             },
1099             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1100               const LLT DstTy = Query.Types[0];
1101               const LLT PtrTy = Query.Types[1];
1102 
1103               const unsigned DstSize = DstTy.getSizeInBits();
1104               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1105 
1106               // Split extloads.
1107               if (DstSize > MemSize)
1108                 return std::make_pair(0, LLT::scalar(MemSize));
1109 
1110               if (!isPowerOf2_32(DstSize)) {
1111                 // We're probably decomposing an odd sized store. Try to split
1112                 // to the widest type. TODO: Account for alignment. As-is it
1113                 // should be OK, since the new parts will be further legalized.
1114                 unsigned FloorSize = PowerOf2Floor(DstSize);
1115                 return std::make_pair(0, LLT::scalar(FloorSize));
1116               }
1117 
1118               if (DstSize > 32 && (DstSize % 32 != 0)) {
1119                 // FIXME: Need a way to specify non-extload of larger size if
1120                 // suitably aligned.
1121                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1122               }
1123 
1124               unsigned MaxSize = maxSizeForAddrSpace(ST,
1125                                                      PtrTy.getAddressSpace(),
1126                                                      Op == G_LOAD);
1127               if (MemSize > MaxSize)
1128                 return std::make_pair(0, LLT::scalar(MaxSize));
1129 
1130               unsigned Align = Query.MMODescrs[0].AlignInBits;
1131               return std::make_pair(0, LLT::scalar(Align));
1132             })
1133         .fewerElementsIf(
1134             [=](const LegalityQuery &Query) -> bool {
1135               return Query.Types[0].isVector() &&
1136                      needToSplitMemOp(Query, Op == G_LOAD);
1137             },
1138             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1139               const LLT DstTy = Query.Types[0];
1140               const LLT PtrTy = Query.Types[1];
1141 
1142               LLT EltTy = DstTy.getElementType();
1143               unsigned MaxSize = maxSizeForAddrSpace(ST,
1144                                                      PtrTy.getAddressSpace(),
1145                                                      Op == G_LOAD);
1146 
1147               // FIXME: Handle widened to power of 2 results better. This ends
1148               // up scalarizing.
1149               // FIXME: 3 element stores scalarized on SI
1150 
1151               // Split if it's too large for the address space.
1152               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1153                 unsigned NumElts = DstTy.getNumElements();
1154                 unsigned EltSize = EltTy.getSizeInBits();
1155 
1156                 if (MaxSize % EltSize == 0) {
1157                   return std::make_pair(
1158                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1159                 }
1160 
1161                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1162 
1163                 // FIXME: Refine when odd breakdowns handled
1164                 // The scalars will need to be re-legalized.
1165                 if (NumPieces == 1 || NumPieces >= NumElts ||
1166                     NumElts % NumPieces != 0)
1167                   return std::make_pair(0, EltTy);
1168 
1169                 return std::make_pair(0,
1170                                       LLT::vector(NumElts / NumPieces, EltTy));
1171               }
1172 
1173               // FIXME: We could probably handle weird extending loads better.
1174               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1175               if (DstTy.getSizeInBits() > MemSize)
1176                 return std::make_pair(0, EltTy);
1177 
1178               unsigned EltSize = EltTy.getSizeInBits();
1179               unsigned DstSize = DstTy.getSizeInBits();
1180               if (!isPowerOf2_32(DstSize)) {
1181                 // We're probably decomposing an odd sized store. Try to split
1182                 // to the widest type. TODO: Account for alignment. As-is it
1183                 // should be OK, since the new parts will be further legalized.
1184                 unsigned FloorSize = PowerOf2Floor(DstSize);
1185                 return std::make_pair(
1186                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1187               }
1188 
1189               // Need to split because of alignment.
1190               unsigned Align = Query.MMODescrs[0].AlignInBits;
1191               if (EltSize > Align &&
1192                   (EltSize / Align < DstTy.getNumElements())) {
1193                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1194               }
1195 
1196               // May need relegalization for the scalars.
1197               return std::make_pair(0, EltTy);
1198             })
1199         .minScalar(0, S32);
1200 
1201     if (IsStore)
1202       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1203 
1204     // TODO: Need a bitcast lower option?
1205     Actions
1206         .widenScalarToNextPow2(0)
1207         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1208   }
1209 
1210   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1211                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1212                                                   {S32, GlobalPtr, 16, 2 * 8},
1213                                                   {S32, LocalPtr, 8, 8},
1214                                                   {S32, LocalPtr, 16, 16},
1215                                                   {S32, PrivatePtr, 8, 8},
1216                                                   {S32, PrivatePtr, 16, 16},
1217                                                   {S32, ConstantPtr, 8, 8},
1218                                                   {S32, ConstantPtr, 16, 2 * 8}});
1219   if (ST.hasFlatAddressSpace()) {
1220     ExtLoads.legalForTypesWithMemDesc(
1221         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1222   }
1223 
1224   ExtLoads.clampScalar(0, S32, S32)
1225           .widenScalarToNextPow2(0)
1226           .unsupportedIfMemSizeNotPow2()
1227           .lower();
1228 
1229   auto &Atomics = getActionDefinitionsBuilder(
1230     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1231      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1232      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1233      G_ATOMICRMW_UMIN})
1234     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1235                {S64, GlobalPtr}, {S64, LocalPtr},
1236                {S32, RegionPtr}, {S64, RegionPtr}});
1237   if (ST.hasFlatAddressSpace()) {
1238     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1239   }
1240 
1241   if (ST.hasLDSFPAtomics()) {
1242     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1243       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1244   }
1245 
1246   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1247   // demarshalling
1248   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1249     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1250                 {S32, FlatPtr}, {S64, FlatPtr}})
1251     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1252                {S32, RegionPtr}, {S64, RegionPtr}});
1253   // TODO: Pointer types, any 32-bit or 64-bit vector
1254 
1255   // Condition should be s32 for scalar, s1 for vector.
1256   getActionDefinitionsBuilder(G_SELECT)
1257     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1258           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1259           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1260     .clampScalar(0, S16, S64)
1261     .scalarize(1)
1262     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1263     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1264     .clampMaxNumElements(0, S32, 2)
1265     .clampMaxNumElements(0, LocalPtr, 2)
1266     .clampMaxNumElements(0, PrivatePtr, 2)
1267     .scalarize(0)
1268     .widenScalarToNextPow2(0)
1269     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1270 
1271   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1272   // be more flexible with the shift amount type.
1273   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1274     .legalFor({{S32, S32}, {S64, S32}});
1275   if (ST.has16BitInsts()) {
1276     if (ST.hasVOP3PInsts()) {
1277       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1278             .clampMaxNumElements(0, S16, 2);
1279     } else
1280       Shifts.legalFor({{S16, S16}});
1281 
1282     // TODO: Support 16-bit shift amounts for all types
1283     Shifts.widenScalarIf(
1284       [=](const LegalityQuery &Query) {
1285         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1286         // 32-bit amount.
1287         const LLT ValTy = Query.Types[0];
1288         const LLT AmountTy = Query.Types[1];
1289         return ValTy.getSizeInBits() <= 16 &&
1290                AmountTy.getSizeInBits() < 16;
1291       }, changeTo(1, S16));
1292     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1293     Shifts.clampScalar(1, S32, S32);
1294     Shifts.clampScalar(0, S16, S64);
1295     Shifts.widenScalarToNextPow2(0, 16);
1296   } else {
1297     // Make sure we legalize the shift amount type first, as the general
1298     // expansion for the shifted type will produce much worse code if it hasn't
1299     // been truncated already.
1300     Shifts.clampScalar(1, S32, S32);
1301     Shifts.clampScalar(0, S32, S64);
1302     Shifts.widenScalarToNextPow2(0, 32);
1303   }
1304   Shifts.scalarize(0);
1305 
1306   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1307     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1308     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1309     unsigned IdxTypeIdx = 2;
1310 
1311     getActionDefinitionsBuilder(Op)
1312       .customIf([=](const LegalityQuery &Query) {
1313           const LLT EltTy = Query.Types[EltTypeIdx];
1314           const LLT VecTy = Query.Types[VecTypeIdx];
1315           const LLT IdxTy = Query.Types[IdxTypeIdx];
1316           const unsigned EltSize = EltTy.getSizeInBits();
1317           return (EltSize == 32 || EltSize == 64) &&
1318                   VecTy.getSizeInBits() % 32 == 0 &&
1319                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1320                   IdxTy.getSizeInBits() == 32;
1321         })
1322       .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
1323                  bitcastToVectorElement32(1))
1324       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1325       .bitcastIf(
1326         all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
1327         [=](const LegalityQuery &Query) {
1328           // For > 64-bit element types, try to turn this into a 64-bit
1329           // element vector since we may be able to do better indexing
1330           // if this is scalar. If not, fall back to 32.
1331           const LLT EltTy = Query.Types[EltTypeIdx];
1332           const LLT VecTy = Query.Types[VecTypeIdx];
1333           const unsigned DstEltSize = EltTy.getSizeInBits();
1334           const unsigned VecSize = VecTy.getSizeInBits();
1335 
1336           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1337           return std::make_pair(
1338             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1339         })
1340       .clampScalar(EltTypeIdx, S32, S64)
1341       .clampScalar(VecTypeIdx, S32, S64)
1342       .clampScalar(IdxTypeIdx, S32, S32)
1343       .clampMaxNumElements(1, S32, 32)
1344       // TODO: Clamp elements for 64-bit vectors?
1345       // It should only be necessary with variable indexes.
1346       // As a last resort, lower to the stack
1347       .lower();
1348   }
1349 
1350   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1351     .unsupportedIf([=](const LegalityQuery &Query) {
1352         const LLT &EltTy = Query.Types[1].getElementType();
1353         return Query.Types[0] != EltTy;
1354       });
1355 
1356   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1357     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1358     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1359 
1360     // FIXME: Doesn't handle extract of illegal sizes.
1361     getActionDefinitionsBuilder(Op)
1362       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1363       // FIXME: Multiples of 16 should not be legal.
1364       .legalIf([=](const LegalityQuery &Query) {
1365           const LLT BigTy = Query.Types[BigTyIdx];
1366           const LLT LitTy = Query.Types[LitTyIdx];
1367           return (BigTy.getSizeInBits() % 32 == 0) &&
1368                  (LitTy.getSizeInBits() % 16 == 0);
1369         })
1370       .widenScalarIf(
1371         [=](const LegalityQuery &Query) {
1372           const LLT BigTy = Query.Types[BigTyIdx];
1373           return (BigTy.getScalarSizeInBits() < 16);
1374         },
1375         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1376       .widenScalarIf(
1377         [=](const LegalityQuery &Query) {
1378           const LLT LitTy = Query.Types[LitTyIdx];
1379           return (LitTy.getScalarSizeInBits() < 16);
1380         },
1381         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1382       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1383       .widenScalarToNextPow2(BigTyIdx, 32);
1384 
1385   }
1386 
1387   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1388     .legalForCartesianProduct(AllS32Vectors, {S32})
1389     .legalForCartesianProduct(AllS64Vectors, {S64})
1390     .clampNumElements(0, V16S32, V32S32)
1391     .clampNumElements(0, V2S64, V16S64)
1392     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1393 
1394   if (ST.hasScalarPackInsts()) {
1395     BuildVector
1396       // FIXME: Should probably widen s1 vectors straight to s32
1397       .minScalarOrElt(0, S16)
1398       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1399       .minScalar(1, S32);
1400 
1401     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1402       .legalFor({V2S16, S32})
1403       .lower();
1404     BuildVector.minScalarOrElt(0, S32);
1405   } else {
1406     BuildVector.customFor({V2S16, S16});
1407     BuildVector.minScalarOrElt(0, S32);
1408 
1409     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1410       .customFor({V2S16, S32})
1411       .lower();
1412   }
1413 
1414   BuildVector.legalIf(isRegisterType(0));
1415 
1416   // FIXME: Clamp maximum size
1417   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1418     .legalIf(isRegisterType(0));
1419 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out before
  // legalization.
1422   if (ST.hasVOP3PInsts()) {
1423     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1424       .customFor({V2S16, V2S16})
1425       .lower();
1426   } else
1427     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1428 
1429   // Merge/Unmerge
1430   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1431     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1432     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1433 
1434     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1435       const LLT Ty = Query.Types[TypeIdx];
1436       if (Ty.isVector()) {
1437         const LLT &EltTy = Ty.getElementType();
1438         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1439           return true;
1440         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1441           return true;
1442       }
1443       return false;
1444     };
1445 
1446     auto &Builder = getActionDefinitionsBuilder(Op)
1447       .lowerFor({{S16, V2S16}})
1448       .lowerIf([=](const LegalityQuery &Query) {
1449           const LLT BigTy = Query.Types[BigTyIdx];
1450           return BigTy.getSizeInBits() == 32;
1451         })
1452       // Try to widen to s16 first for small types.
1453       // TODO: Only do this on targets with legal s16 shifts
1454       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1455       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1456       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1457       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1458                            elementTypeIs(1, S16)),
1459                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1463       .clampScalar(LitTyIdx, S32, S512)
1464       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1465       // Break up vectors with weird elements into scalars
1466       .fewerElementsIf(
1467         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1468         scalarize(0))
1469       .fewerElementsIf(
1470         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1471         scalarize(1))
1472       .clampScalar(BigTyIdx, S32, MaxScalar);
1473 
1474     if (Op == G_MERGE_VALUES) {
1475       Builder.widenScalarIf(
1476         // TODO: Use 16-bit shifts if legal for 8-bit values?
1477         [=](const LegalityQuery &Query) {
1478           const LLT Ty = Query.Types[LitTyIdx];
1479           return Ty.getSizeInBits() < 32;
1480         },
1481         changeTo(LitTyIdx, S32));
1482     }
1483 
1484     Builder.widenScalarIf(
1485       [=](const LegalityQuery &Query) {
1486         const LLT Ty = Query.Types[BigTyIdx];
1487         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1488           Ty.getSizeInBits() % 16 != 0;
1489       },
1490       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1493         const LLT &Ty = Query.Types[BigTyIdx];
1494         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1495         if (NewSizeInBits >= 256) {
1496           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1497           if (RoundedTo < NewSizeInBits)
1498             NewSizeInBits = RoundedTo;
1499         }
1500         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1501       })
1502       .legalIf([=](const LegalityQuery &Query) {
1503           const LLT &BigTy = Query.Types[BigTyIdx];
1504           const LLT &LitTy = Query.Types[LitTyIdx];
1505 
1506           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1507             return false;
1508           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1509             return false;
1510 
1511           return BigTy.getSizeInBits() % 16 == 0 &&
1512                  LitTy.getSizeInBits() % 16 == 0 &&
1513                  BigTy.getSizeInBits() <= MaxRegisterSize;
1514         })
1515       // Any vectors left are the wrong size. Scalarize them.
1516       .scalarize(0)
1517       .scalarize(1);
1518   }
1519 
1520   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1521   // RegBankSelect.
1522   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1523     .legalFor({{S32}, {S64}});
1524 
1525   if (ST.hasVOP3PInsts()) {
1526     SextInReg.lowerFor({{V2S16}})
1527       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1528       // get more vector shift opportunities, since we'll get those when
1529       // expanded.
1530       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1531   } else if (ST.has16BitInsts()) {
1532     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1533   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1536     SextInReg.lowerFor({{S32}, {S64}});
1537   }
1538 
1539   SextInReg
1540     .scalarize(0)
1541     .clampScalar(0, S32, S64)
1542     .lower();
1543 
1544   getActionDefinitionsBuilder(G_FSHR)
1545     .legalFor({{S32, S32}})
1546     .scalarize(0)
1547     .lower();
1548 
1549   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1550     .legalFor({S64});
1551 
1552   getActionDefinitionsBuilder(G_FENCE)
1553     .alwaysLegal();
1554 
1555   getActionDefinitionsBuilder({
1556       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1557       G_FCOPYSIGN,
1558 
1559       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1560       G_ATOMICRMW_NAND,
1561       G_ATOMICRMW_FSUB,
1562       G_READ_REGISTER,
1563       G_WRITE_REGISTER,
1564 
1565       G_SADDO, G_SSUBO,
1566 
      // TODO: Implement
1568       G_FMINIMUM, G_FMAXIMUM,
1569       G_FSHL
1570     }).lower();
1571 
1572   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1573         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1574         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1575     .unsupported();
1576 
1577   computeTables();
1578   verify(*ST.getInstrInfo());
1579 }
1580 
1581 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1582                                          MachineInstr &MI) const {
1583   MachineIRBuilder &B = Helper.MIRBuilder;
1584   MachineRegisterInfo &MRI = *B.getMRI();
1585   GISelChangeObserver &Observer = Helper.Observer;
1586 
1587   switch (MI.getOpcode()) {
1588   case TargetOpcode::G_ADDRSPACE_CAST:
1589     return legalizeAddrSpaceCast(MI, MRI, B);
1590   case TargetOpcode::G_FRINT:
1591     return legalizeFrint(MI, MRI, B);
1592   case TargetOpcode::G_FCEIL:
1593     return legalizeFceil(MI, MRI, B);
1594   case TargetOpcode::G_INTRINSIC_TRUNC:
1595     return legalizeIntrinsicTrunc(MI, MRI, B);
1596   case TargetOpcode::G_SITOFP:
1597     return legalizeITOFP(MI, MRI, B, true);
1598   case TargetOpcode::G_UITOFP:
1599     return legalizeITOFP(MI, MRI, B, false);
1600   case TargetOpcode::G_FPTOSI:
1601     return legalizeFPTOI(MI, MRI, B, true);
1602   case TargetOpcode::G_FPTOUI:
1603     return legalizeFPTOI(MI, MRI, B, false);
1604   case TargetOpcode::G_FMINNUM:
1605   case TargetOpcode::G_FMAXNUM:
1606   case TargetOpcode::G_FMINNUM_IEEE:
1607   case TargetOpcode::G_FMAXNUM_IEEE:
1608     return legalizeMinNumMaxNum(Helper, MI);
1609   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1610     return legalizeExtractVectorElt(MI, MRI, B);
1611   case TargetOpcode::G_INSERT_VECTOR_ELT:
1612     return legalizeInsertVectorElt(MI, MRI, B);
1613   case TargetOpcode::G_SHUFFLE_VECTOR:
1614     return legalizeShuffleVector(MI, MRI, B);
1615   case TargetOpcode::G_FSIN:
1616   case TargetOpcode::G_FCOS:
1617     return legalizeSinCos(MI, MRI, B);
1618   case TargetOpcode::G_GLOBAL_VALUE:
1619     return legalizeGlobalValue(MI, MRI, B);
1620   case TargetOpcode::G_LOAD:
1621     return legalizeLoad(MI, MRI, B, Observer);
1622   case TargetOpcode::G_FMAD:
1623     return legalizeFMad(MI, MRI, B);
1624   case TargetOpcode::G_FDIV:
1625     return legalizeFDIV(MI, MRI, B);
1626   case TargetOpcode::G_UDIV:
1627   case TargetOpcode::G_UREM:
1628     return legalizeUDIV_UREM(MI, MRI, B);
1629   case TargetOpcode::G_SDIV:
1630   case TargetOpcode::G_SREM:
1631     return legalizeSDIV_SREM(MI, MRI, B);
1632   case TargetOpcode::G_ATOMIC_CMPXCHG:
1633     return legalizeAtomicCmpXChg(MI, MRI, B);
1634   case TargetOpcode::G_FLOG:
1635     return legalizeFlog(MI, B, numbers::ln2f);
1636   case TargetOpcode::G_FLOG10:
1637     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1638   case TargetOpcode::G_FEXP:
1639     return legalizeFExp(MI, B);
1640   case TargetOpcode::G_FPOW:
1641     return legalizeFPow(MI, B);
1642   case TargetOpcode::G_FFLOOR:
1643     return legalizeFFloor(MI, MRI, B);
1644   case TargetOpcode::G_BUILD_VECTOR:
1645     return legalizeBuildVector(MI, MRI, B);
1646   default:
1647     return false;
1648   }
1649 
1650   llvm_unreachable("expected switch to return");
1651 }
1652 
1653 Register AMDGPULegalizerInfo::getSegmentAperture(
1654   unsigned AS,
1655   MachineRegisterInfo &MRI,
1656   MachineIRBuilder &B) const {
1657   MachineFunction &MF = B.getMF();
1658   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1659   const LLT S32 = LLT::scalar(32);
1660 
1661   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1662 
1663   if (ST.hasApertureRegs()) {
1664     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1665     // getreg.
1666     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1667         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1668         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1669     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1670         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1671         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1672     unsigned Encoding =
1673         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1674         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1675         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
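    // The simm16 operand packed above follows, roughly, the s_getreg encoding:
    // register id in bits [5:0], field offset in bits [10:6], and field width
    // minus one in bits [15:11]. The 16-bit field read below holds the top 16
    // bits of the aperture base, so shifting it left by WidthM1 + 1 (i.e. 16)
    // rebuilds the aperture's high 32 bits, which is all a flat address needs.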
1676 
1677     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1678 
1679     B.buildInstr(AMDGPU::S_GETREG_B32)
1680       .addDef(GetReg)
1681       .addImm(Encoding);
1682     MRI.setType(GetReg, S32);
1683 
1684     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1685     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1686   }
1687 
1688   Register QueuePtr = MRI.createGenericVirtualRegister(
1689     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1690 
1691   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1692     return Register();
1693 
1694   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1695   // private_segment_aperture_base_hi.
1696   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1697 
1698   // TODO: can we be smarter about machine pointer info?
1699   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1700   MachineMemOperand *MMO = MF.getMachineMemOperand(
1701       PtrInfo,
1702       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1703           MachineMemOperand::MOInvariant,
1704       4, commonAlignment(Align(64), StructOffset));
1705 
1706   Register LoadAddr;
1707 
1708   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1709   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1710 }
1711 
1712 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1713   MachineInstr &MI, MachineRegisterInfo &MRI,
1714   MachineIRBuilder &B) const {
1715   MachineFunction &MF = B.getMF();
1716 
1717   const LLT S32 = LLT::scalar(32);
1718   Register Dst = MI.getOperand(0).getReg();
1719   Register Src = MI.getOperand(1).getReg();
1720 
1721   LLT DstTy = MRI.getType(Dst);
1722   LLT SrcTy = MRI.getType(Src);
1723   unsigned DestAS = DstTy.getAddressSpace();
1724   unsigned SrcAS = SrcTy.getAddressSpace();
1725 
1726   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1727   // vector element.
1728   assert(!DstTy.isVector());
1729 
1730   const AMDGPUTargetMachine &TM
1731     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1732 
1733   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1734     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1735     return true;
1736   }
1737 
1738   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1739     // Truncate.
1740     B.buildExtract(Dst, Src, 0);
1741     MI.eraseFromParent();
1742     return true;
1743   }
1744 
1745   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1746     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1747     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1748 
1749     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1750     // another. Merge operands are required to be the same type, but creating an
1751     // extra ptrtoint would be kind of pointless.
1752     auto HighAddr = B.buildConstant(
1753       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1754     B.buildMerge(Dst, {Src, HighAddr});
1755     MI.eraseFromParent();
1756     return true;
1757   }
1758 
1759   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1760     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1761            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1762     unsigned NullVal = TM.getNullPointerValue(DestAS);
1763 
1764     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1765     auto FlatNull = B.buildConstant(SrcTy, 0);
1766 
1767     // Extract low 32-bits of the pointer.
1768     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1769 
1770     auto CmpRes =
1771         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1772     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1773 
1774     MI.eraseFromParent();
1775     return true;
1776   }
1777 
1778   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1779     return false;
1780 
1781   if (!ST.hasFlatAddressSpace())
1782     return false;
1783 
1784   auto SegmentNull =
1785       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1786   auto FlatNull =
1787       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
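  // Rough shape of the lowering below: a non-null segment pointer becomes the
  // flat pointer {aperture_hi, offset}, i.e. the 32-bit segment offset is the
  // low half and the aperture base is the high half, while the segment null
  // value is translated to the flat null value by the final select.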
1788 
1789   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1790   if (!ApertureReg.isValid())
1791     return false;
1792 
1793   auto CmpRes =
1794       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1795 
1796   // Coerce the type of the low half of the result so we can use merge_values.
1797   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1798 
1799   // TODO: Should we allow mismatched types but matching sizes in merges to
1800   // avoid the ptrtoint?
1801   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1802   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1803 
1804   MI.eraseFromParent();
1805   return true;
1806 }
1807 
1808 bool AMDGPULegalizerInfo::legalizeFrint(
1809   MachineInstr &MI, MachineRegisterInfo &MRI,
1810   MachineIRBuilder &B) const {
1811   Register Src = MI.getOperand(1).getReg();
1812   LLT Ty = MRI.getType(Src);
1813   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1814 
1815   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1816   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
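  // C1 is 2^52 and C2 is the largest double below 2^52. Adding and then
  // subtracting copysign(2^52, src) rounds to an integer in the current
  // rounding mode; e.g. 3.7 + 2^52 rounds to 2^52 + 4, and subtracting 2^52
  // leaves 4.0. Anything with |src| > C2 is already an integer and is passed
  // through unchanged by the final select.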
1817 
1818   auto C1 = B.buildFConstant(Ty, C1Val);
1819   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1820 
1821   // TODO: Should this propagate fast-math-flags?
1822   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1823   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1824 
1825   auto C2 = B.buildFConstant(Ty, C2Val);
1826   auto Fabs = B.buildFAbs(Ty, Src);
1827 
1828   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1829   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1830   MI.eraseFromParent();
1831   return true;
1832 }
1833 
1834 bool AMDGPULegalizerInfo::legalizeFceil(
1835   MachineInstr &MI, MachineRegisterInfo &MRI,
1836   MachineIRBuilder &B) const {
1837 
1838   const LLT S1 = LLT::scalar(1);
1839   const LLT S64 = LLT::scalar(64);
1840 
1841   Register Src = MI.getOperand(1).getReg();
1842   assert(MRI.getType(Src) == S64);
1843 
1844   // result = trunc(src)
1845   // if (src > 0.0 && src != result)
1846   //   result += 1.0
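  // For example, ceil(2.3): trunc = 2.0, src > 0 and src != trunc, so 1.0 is
  // added and the result is 3.0. For ceil(-2.3): trunc = -2.0, the src > 0
  // test fails, so 0.0 is added and the result stays -2.0.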
1847 
1848   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1849 
1850   const auto Zero = B.buildFConstant(S64, 0.0);
1851   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1855   auto Add = B.buildSelect(S64, And, One, Zero);
1856 
1857   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1860 }
1861 
1862 static MachineInstrBuilder extractF64Exponent(Register Hi,
1863                                               MachineIRBuilder &B) {
1864   const unsigned FractBits = 52;
1865   const unsigned ExpBits = 11;
1866   LLT S32 = LLT::scalar(32);
1867 
1868   auto Const0 = B.buildConstant(S32, FractBits - 32);
1869   auto Const1 = B.buildConstant(S32, ExpBits);
1870 
1871   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1872     .addUse(Hi)
1873     .addUse(Const0.getReg(0))
1874     .addUse(Const1.getReg(0));
1875 
1876   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1877 }
1878 
1879 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1880   MachineInstr &MI, MachineRegisterInfo &MRI,
1881   MachineIRBuilder &B) const {
1882   const LLT S1 = LLT::scalar(1);
1883   const LLT S32 = LLT::scalar(32);
1884   const LLT S64 = LLT::scalar(64);
1885 
1886   Register Src = MI.getOperand(1).getReg();
1887   assert(MRI.getType(Src) == S64);
1888 
1889   // TODO: Should this use extract since the low half is unused?
1890   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1891   Register Hi = Unmerge.getReg(1);
1892 
1893   // Extract the upper half, since this is where we will find the sign and
1894   // exponent.
1895   auto Exp = extractF64Exponent(Hi, B);
1896 
1897   const unsigned FractBits = 52;
1898 
1899   // Extract the sign bit.
1900   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1901   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1902 
1903   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1904 
1905   const auto Zero32 = B.buildConstant(S32, 0);
1906 
1907   // Extend back to 64-bits.
1908   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1909 
1910   auto Shr = B.buildAShr(S64, FractMask, Exp);
1911   auto Not = B.buildNot(S64, Shr);
1912   auto Tmp0 = B.buildAnd(S64, Src, Not);
1913   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
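  // Exp is the unbiased exponent. If Exp < 0 then |Src| < 1.0 and the
  // truncated value is +/-0.0, so only the sign bit survives. If Exp > 51
  // every fraction bit is significant and Src is already integral, so Src is
  // returned as-is. Otherwise the low (52 - Exp) fraction bits were cleared
  // by the mask computed above.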
1914 
1915   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1916   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1917 
1918   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1919   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1920   MI.eraseFromParent();
1921   return true;
1922 }
1923 
1924 bool AMDGPULegalizerInfo::legalizeITOFP(
1925   MachineInstr &MI, MachineRegisterInfo &MRI,
1926   MachineIRBuilder &B, bool Signed) const {
1927 
1928   Register Dst = MI.getOperand(0).getReg();
1929   Register Src = MI.getOperand(1).getReg();
1930 
1931   const LLT S64 = LLT::scalar(64);
1932   const LLT S32 = LLT::scalar(32);
1933 
1934   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1935 
1936   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
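  // The conversion is done piecewise, roughly:
  //   result = (double)Hi * 2^32 + (double)Lo
  // where only the high half uses a signed conversion and the scaling by 2^32
  // is done with amdgcn.ldexp below.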
1937 
1938   auto CvtHi = Signed ?
1939     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1940     B.buildUITOFP(S64, Unmerge.getReg(1));
1941 
1942   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1943 
1944   auto ThirtyTwo = B.buildConstant(S32, 32);
1945   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1946     .addUse(CvtHi.getReg(0))
1947     .addUse(ThirtyTwo.getReg(0));
1948 
1949   // TODO: Should this propagate fast-math-flags?
1950   B.buildFAdd(Dst, LdExp, CvtLo);
1951   MI.eraseFromParent();
1952   return true;
1953 }
1954 
1955 // TODO: Copied from DAG implementation. Verify logic and document how this
1956 // actually works.
1957 bool AMDGPULegalizerInfo::legalizeFPTOI(
1958   MachineInstr &MI, MachineRegisterInfo &MRI,
1959   MachineIRBuilder &B, bool Signed) const {
1960 
1961   Register Dst = MI.getOperand(0).getReg();
1962   Register Src = MI.getOperand(1).getReg();
1963 
1964   const LLT S64 = LLT::scalar(64);
1965   const LLT S32 = LLT::scalar(32);
1966 
1967   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1968 
1969   unsigned Flags = MI.getFlags();
1970 
1971   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1972   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1973   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
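  // K0 is 2^-32 and K1 is -(2^32) as doubles; the truncated value is split as
  //   hi = floor(trunc * 2^-32),  lo = trunc - hi * 2^32   (via the fma)
  // and the two 32-bit conversions are merged into the 64-bit result.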
1974 
1975   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1976   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1977   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1978 
1979   auto Hi = Signed ?
1980     B.buildFPTOSI(S32, FloorMul) :
1981     B.buildFPTOUI(S32, FloorMul);
1982   auto Lo = B.buildFPTOUI(S32, Fma);
1983 
1984   B.buildMerge(Dst, { Lo, Hi });
1985   MI.eraseFromParent();
1986 
1987   return true;
1988 }
1989 
1990 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1991                                                MachineInstr &MI) const {
1992   MachineFunction &MF = Helper.MIRBuilder.getMF();
1993   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1994 
1995   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1996                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1997 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
2000   if (!MFI->getMode().IEEE)
2001     return !IsIEEEOp;
2002 
2003   if (IsIEEEOp)
2004     return true;
2005 
2006   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2007 }
2008 
2009 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2010   MachineInstr &MI, MachineRegisterInfo &MRI,
2011   MachineIRBuilder &B) const {
2012   // TODO: Should move some of this into LegalizerHelper.
2013 
2014   // TODO: Promote dynamic indexing of s16 to s32
2015 
2016   // FIXME: Artifact combiner probably should have replaced the truncated
2017   // constant before this, so we shouldn't need
2018   // getConstantVRegValWithLookThrough.
2019   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2020     MI.getOperand(2).getReg(), MRI);
2021   if (!IdxVal) // Dynamic case will be selected to register indexing.
2022     return true;
2023 
2024   Register Dst = MI.getOperand(0).getReg();
2025   Register Vec = MI.getOperand(1).getReg();
2026 
2027   LLT VecTy = MRI.getType(Vec);
2028   LLT EltTy = VecTy.getElementType();
2029   assert(EltTy == MRI.getType(Dst));
2030 
2031   if (IdxVal->Value < VecTy.getNumElements())
2032     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2033   else
2034     B.buildUndef(Dst);
2035 
2036   MI.eraseFromParent();
2037   return true;
2038 }
2039 
2040 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2041   MachineInstr &MI, MachineRegisterInfo &MRI,
2042   MachineIRBuilder &B) const {
2043   // TODO: Should move some of this into LegalizerHelper.
2044 
2045   // TODO: Promote dynamic indexing of s16 to s32
2046 
2047   // FIXME: Artifact combiner probably should have replaced the truncated
2048   // constant before this, so we shouldn't need
2049   // getConstantVRegValWithLookThrough.
2050   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2051     MI.getOperand(3).getReg(), MRI);
2052   if (!IdxVal) // Dynamic case will be selected to register indexing.
2053     return true;
2054 
2055   Register Dst = MI.getOperand(0).getReg();
2056   Register Vec = MI.getOperand(1).getReg();
2057   Register Ins = MI.getOperand(2).getReg();
2058 
2059   LLT VecTy = MRI.getType(Vec);
2060   LLT EltTy = VecTy.getElementType();
2061   assert(EltTy == MRI.getType(Ins));
2062 
2063   if (IdxVal->Value < VecTy.getNumElements())
2064     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2065   else
2066     B.buildUndef(Dst);
2067 
2068   MI.eraseFromParent();
2069   return true;
2070 }
2071 
2072 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2073   MachineInstr &MI, MachineRegisterInfo &MRI,
2074   MachineIRBuilder &B) const {
2075   const LLT V2S16 = LLT::vector(2, 16);
2076 
2077   Register Dst = MI.getOperand(0).getReg();
2078   Register Src0 = MI.getOperand(1).getReg();
2079   LLT DstTy = MRI.getType(Dst);
2080   LLT SrcTy = MRI.getType(Src0);
2081 
2082   if (SrcTy == V2S16 && DstTy == V2S16 &&
2083       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2084     return true;
2085 
2086   MachineIRBuilder HelperBuilder(MI);
2087   GISelObserverWrapper DummyObserver;
2088   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2089   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2090 }
2091 
2092 bool AMDGPULegalizerInfo::legalizeSinCos(
2093   MachineInstr &MI, MachineRegisterInfo &MRI,
2094   MachineIRBuilder &B) const {
2095 
2096   Register DstReg = MI.getOperand(0).getReg();
2097   Register SrcReg = MI.getOperand(1).getReg();
2098   LLT Ty = MRI.getType(DstReg);
2099   unsigned Flags = MI.getFlags();
2100 
2101   Register TrigVal;
2102   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
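  // The amdgcn.sin/cos intrinsics take their input in units of 2*pi (full
  // turns), so the operand is prescaled by 1/(2*pi). On subtargets where the
  // instruction only accepts a reduced input range, the scaled value is first
  // wrapped into [0, 1) with amdgcn.fract.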
2103   if (ST.hasTrigReducedRange()) {
2104     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2105     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2106       .addUse(MulVal.getReg(0))
2107       .setMIFlags(Flags).getReg(0);
2108   } else
2109     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2110 
2111   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2112     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2113   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2114     .addUse(TrigVal)
2115     .setMIFlags(Flags);
2116   MI.eraseFromParent();
2117   return true;
2118 }
2119 
2120 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2121                                                   MachineIRBuilder &B,
2122                                                   const GlobalValue *GV,
2123                                                   int64_t Offset,
2124                                                   unsigned GAFlags) const {
2125   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2126   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2127   // to the following code sequence:
2128   //
2129   // For constant address space:
2130   //   s_getpc_b64 s[0:1]
2131   //   s_add_u32 s0, s0, $symbol
2132   //   s_addc_u32 s1, s1, 0
2133   //
2134   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2135   //   a fixup or relocation is emitted to replace $symbol with a literal
2136   //   constant, which is a pc-relative offset from the encoding of the $symbol
2137   //   operand to the global variable.
2138   //
2139   // For global address space:
2140   //   s_getpc_b64 s[0:1]
2141   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2142   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2143   //
2144   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2145   //   fixups or relocations are emitted to replace $symbol@*@lo and
2146   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2147   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2148   //   operand to the global variable.
2149   //
2150   // What we want here is an offset from the value returned by s_getpc
2151   // (which is the address of the s_add_u32 instruction) to the global
2152   // variable, but since the encoding of $symbol starts 4 bytes after the start
2153   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2154   // small. This requires us to add 4 to the global variable offset in order to
2155   // compute the correct address.
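  // For example, if s_getpc_b64 returns address P, the $symbol literal is
  // encoded at P + 4 inside the s_add_u32, so the relocation is resolved
  // relative to P + 4. Emitting the operand as GV + Offset + 4 therefore
  // yields the value (GV + Offset) - P, which is exactly the displacement the
  // add sequence needs.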
2156 
2157   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2158 
2159   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2160     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2161 
2162   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2163     .addDef(PCReg);
2164 
2165   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2166   if (GAFlags == SIInstrInfo::MO_NONE)
2167     MIB.addImm(0);
2168   else
2169     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2170 
2171   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2172 
2173   if (PtrTy.getSizeInBits() == 32)
2174     B.buildExtract(DstReg, PCReg, 0);
2175   return true;
}
2177 
2178 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2179   MachineInstr &MI, MachineRegisterInfo &MRI,
2180   MachineIRBuilder &B) const {
2181   Register DstReg = MI.getOperand(0).getReg();
2182   LLT Ty = MRI.getType(DstReg);
2183   unsigned AS = Ty.getAddressSpace();
2184 
2185   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2186   MachineFunction &MF = B.getMF();
2187   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2188 
2189   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2190     if (!MFI->isEntryFunction()) {
2191       const Function &Fn = MF.getFunction();
2192       DiagnosticInfoUnsupported BadLDSDecl(
2193         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2194         DS_Warning);
2195       Fn.getContext().diagnose(BadLDSDecl);
2196 
2197       // We currently don't have a way to correctly allocate LDS objects that
2198       // aren't directly associated with a kernel. We do force inlining of
2199       // functions that use local objects. However, if these dead functions are
2200       // not eliminated, we don't want a compile time error. Just emit a warning
2201       // and a trap, since there should be no callable path here.
2202       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2203       B.buildUndef(DstReg);
2204       MI.eraseFromParent();
2205       return true;
2206     }
2207 
2208     // TODO: We could emit code to handle the initialization somewhere.
2209     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2210       const SITargetLowering *TLI = ST.getTargetLowering();
2211       if (!TLI->shouldUseLDSConstAddress(GV)) {
2212         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2214       }
2215 
2216       B.buildConstant(
2217           DstReg,
2218           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2219       MI.eraseFromParent();
2220       return true;
2221     }
2222 
2223     const Function &Fn = MF.getFunction();
2224     DiagnosticInfoUnsupported BadInit(
2225       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2226     Fn.getContext().diagnose(BadInit);
2227     return true;
2228   }
2229 
2230   const SITargetLowering *TLI = ST.getTargetLowering();
2231 
2232   if (TLI->shouldEmitFixup(GV)) {
2233     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2234     MI.eraseFromParent();
2235     return true;
2236   }
2237 
2238   if (TLI->shouldEmitPCReloc(GV)) {
2239     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2240     MI.eraseFromParent();
2241     return true;
2242   }
2243 
2244   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2245   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2246 
2247   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2248       MachinePointerInfo::getGOT(MF),
2249       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2250           MachineMemOperand::MOInvariant,
2251       8 /*Size*/, Align(8));
2252 
2253   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2254 
2255   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2257     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2258     B.buildExtract(DstReg, Load, 0);
2259   } else
2260     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2261 
2262   MI.eraseFromParent();
2263   return true;
2264 }
2265 
2266 bool AMDGPULegalizerInfo::legalizeLoad(
2267   MachineInstr &MI, MachineRegisterInfo &MRI,
2268   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2269   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2270   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2271   Observer.changingInstr(MI);
2272   MI.getOperand(1).setReg(Cast.getReg(0));
2273   Observer.changedInstr(MI);
2274   return true;
2275 }
2276 
2277 bool AMDGPULegalizerInfo::legalizeFMad(
2278   MachineInstr &MI, MachineRegisterInfo &MRI,
2279   MachineIRBuilder &B) const {
2280   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2281   assert(Ty.isScalar());
2282 
2283   MachineFunction &MF = B.getMF();
2284   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2285 
2286   // TODO: Always legal with future ftz flag.
2287   // FIXME: Do we need just output?
2288   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2289     return true;
2290   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2291     return true;
2292 
2293   MachineIRBuilder HelperBuilder(MI);
2294   GISelObserverWrapper DummyObserver;
2295   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2296   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2297 }
2298 
2299 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2300   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2301   Register DstReg = MI.getOperand(0).getReg();
2302   Register PtrReg = MI.getOperand(1).getReg();
2303   Register CmpVal = MI.getOperand(2).getReg();
2304   Register NewVal = MI.getOperand(3).getReg();
2305 
2306   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2307          "this should not have been custom lowered");
2308 
2309   LLT ValTy = MRI.getType(CmpVal);
2310   LLT VecTy = LLT::vector(2, ValTy);
2311 
2312   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2313 
2314   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2315     .addDef(DstReg)
2316     .addUse(PtrReg)
2317     .addUse(PackedVal)
2318     .setMemRefs(MI.memoperands());
2319 
2320   MI.eraseFromParent();
2321   return true;
2322 }
2323 
2324 bool AMDGPULegalizerInfo::legalizeFlog(
2325   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2326   Register Dst = MI.getOperand(0).getReg();
2327   Register Src = MI.getOperand(1).getReg();
2328   LLT Ty = B.getMRI()->getType(Dst);
2329   unsigned Flags = MI.getFlags();
2330 
2331   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2332   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2333 
2334   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2335   MI.eraseFromParent();
2336   return true;
2337 }
2338 
2339 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2340                                        MachineIRBuilder &B) const {
2341   Register Dst = MI.getOperand(0).getReg();
2342   Register Src = MI.getOperand(1).getReg();
2343   unsigned Flags = MI.getFlags();
2344   LLT Ty = B.getMRI()->getType(Dst);
2345 
2346   auto K = B.buildFConstant(Ty, numbers::log2e);
2347   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2348   B.buildFExp2(Dst, Mul, Flags);
2349   MI.eraseFromParent();
2350   return true;
2351 }
2352 
2353 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2354                                        MachineIRBuilder &B) const {
2355   Register Dst = MI.getOperand(0).getReg();
2356   Register Src0 = MI.getOperand(1).getReg();
2357   Register Src1 = MI.getOperand(2).getReg();
2358   unsigned Flags = MI.getFlags();
2359   LLT Ty = B.getMRI()->getType(Dst);
2360   const LLT S16 = LLT::scalar(16);
2361   const LLT S32 = LLT::scalar(32);
2362 
2363   if (Ty == S32) {
2364     auto Log = B.buildFLog2(S32, Src0, Flags);
2365     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2366       .addUse(Log.getReg(0))
2367       .addUse(Src1)
2368       .setMIFlags(Flags);
2369     B.buildFExp2(Dst, Mul, Flags);
2370   } else if (Ty == S16) {
2371     // There's no f16 fmul_legacy, so we need to convert for it.
2372     auto Log = B.buildFLog2(S16, Src0, Flags);
2373     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2374     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2375     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2376       .addUse(Ext0.getReg(0))
2377       .addUse(Ext1.getReg(0))
2378       .setMIFlags(Flags);
2379 
2380     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2381   } else
2382     return false;
2383 
2384   MI.eraseFromParent();
2385   return true;
2386 }
2387 
2388 // Find a source register, ignoring any possible source modifiers.
2389 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2390   Register ModSrc = OrigSrc;
2391   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2392     ModSrc = SrcFNeg->getOperand(1).getReg();
2393     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2394       ModSrc = SrcFAbs->getOperand(1).getReg();
2395   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2396     ModSrc = SrcFAbs->getOperand(1).getReg();
2397   return ModSrc;
2398 }
2399 
2400 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2401                                          MachineRegisterInfo &MRI,
2402                                          MachineIRBuilder &B) const {
2403 
2404   const LLT S1 = LLT::scalar(1);
2405   const LLT S64 = LLT::scalar(64);
2406   Register Dst = MI.getOperand(0).getReg();
2407   Register OrigSrc = MI.getOperand(1).getReg();
2408   unsigned Flags = MI.getFlags();
2409   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2410          "this should not have been custom lowered");
2411 
2412   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2413   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2414   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2415   // V_FRACT bug is:
2416   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2417   //
2418   // Convert floor(x) to (x - fract(x))
2419 
2420   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2421     .addUse(OrigSrc)
2422     .setMIFlags(Flags);
2423 
2424   // Give source modifier matching some assistance before obscuring a foldable
2425   // pattern.
2426 
2427   // TODO: We can avoid the neg on the fract? The input sign to fract
2428   // shouldn't matter?
2429   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2430 
2431   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2432 
2433   Register Min = MRI.createGenericVirtualRegister(S64);
2434 
2435   // We don't need to concern ourselves with the snan handling difference, so
2436   // use the one which will directly select.
2437   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2438   if (MFI->getMode().IEEE)
2439     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2440   else
2441     B.buildFMinNum(Min, Fract, Const, Flags);
2442 
2443   Register CorrectedFract = Min;
2444   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2446     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2447   }
2448 
2449   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2450   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2451 
2452   MI.eraseFromParent();
2453   return true;
2454 }
2455 
2456 // Turn an illegal packed v2s16 build vector into bit operations.
2457 // TODO: This should probably be a bitcast action in LegalizerHelper.
2458 bool AMDGPULegalizerInfo::legalizeBuildVector(
2459   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2460   Register Dst = MI.getOperand(0).getReg();
2461   const LLT S32 = LLT::scalar(32);
2462   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2463 
2464   Register Src0 = MI.getOperand(1).getReg();
2465   Register Src1 = MI.getOperand(2).getReg();
2466   assert(MRI.getType(Src0) == LLT::scalar(16));
2467 
2468   auto Merge = B.buildMerge(S32, {Src0, Src1});
2469   B.buildBitcast(Dst, Merge);
2470 
2471   MI.eraseFromParent();
2472   return true;
2473 }
2474 
2475 // Return the use branch instruction, otherwise null if the usage is invalid.
2476 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2477                                        MachineRegisterInfo &MRI,
2478                                        MachineInstr *&Br,
2479                                        MachineBasicBlock *&UncondBrTarget) {
2480   Register CondDef = MI.getOperand(0).getReg();
2481   if (!MRI.hasOneNonDBGUse(CondDef))
2482     return nullptr;
2483 
2484   MachineBasicBlock *Parent = MI.getParent();
2485   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2486   if (UseMI.getParent() != Parent ||
2487       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2488     return nullptr;
2489 
2490   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2491   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2492   if (Next == Parent->end()) {
2493     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2494     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2495       return nullptr;
2496     UncondBrTarget = &*NextMBB;
2497   } else {
2498     if (Next->getOpcode() != AMDGPU::G_BR)
2499       return nullptr;
2500     Br = &*Next;
2501     UncondBrTarget = Br->getOperand(0).getMBB();
2502   }
2503 
2504   return &UseMI;
2505 }
2506 
2507 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2508                                          const ArgDescriptor *Arg,
2509                                          const TargetRegisterClass *ArgRC,
2510                                          LLT ArgTy) const {
2511   MCRegister SrcReg = Arg->getRegister();
2512   assert(SrcReg.isPhysical() && "Physical register expected");
2513   assert(DstReg.isVirtual() && "Virtual register expected");
2514 
2515   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2516                                              ArgTy);
2517   if (Arg->isMasked()) {
2518     // TODO: Should we try to emit this once in the entry block?
2519     const LLT S32 = LLT::scalar(32);
2520     const unsigned Mask = Arg->getMask();
2521     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2522 
2523     Register AndMaskSrc = LiveIn;
2524 
2525     if (Shift != 0) {
2526       auto ShiftAmt = B.buildConstant(S32, Shift);
2527       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2528     }
2529 
2530     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2531   } else {
2532     B.buildCopy(DstReg, LiveIn);
2533   }
2534 
2535   return true;
2536 }
2537 
2538 bool AMDGPULegalizerInfo::loadInputValue(
2539     Register DstReg, MachineIRBuilder &B,
2540     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2541   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2542   const ArgDescriptor *Arg;
2543   const TargetRegisterClass *ArgRC;
2544   LLT ArgTy;
2545   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2546 
2547   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2548     return false; // TODO: Handle these
2549   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2550 }
2551 
2552 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2553     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2554     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2555   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2556     return false;
2557 
2558   MI.eraseFromParent();
2559   return true;
2560 }
2561 
2562 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2563                                        MachineRegisterInfo &MRI,
2564                                        MachineIRBuilder &B) const {
2565   Register Dst = MI.getOperand(0).getReg();
2566   LLT DstTy = MRI.getType(Dst);
2567   LLT S16 = LLT::scalar(16);
2568   LLT S32 = LLT::scalar(32);
2569   LLT S64 = LLT::scalar(64);
2570 
2571   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2572     return true;
2573 
2574   if (DstTy == S16)
2575     return legalizeFDIV16(MI, MRI, B);
2576   if (DstTy == S32)
2577     return legalizeFDIV32(MI, MRI, B);
2578   if (DstTy == S64)
2579     return legalizeFDIV64(MI, MRI, B);
2580 
2581   return false;
2582 }
2583 
2584 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2585                                                   Register DstReg,
2586                                                   Register X,
2587                                                   Register Y,
2588                                                   bool IsDiv) const {
2589   const LLT S1 = LLT::scalar(1);
2590   const LLT S32 = LLT::scalar(32);
2591 
2592   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2593   // algorithm used here.
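  // Rough outline: z ~= 2^32 / y is estimated from a float reciprocal scaled
  // by 0x4f7ffffe (just below 2^32), refined with one Newton-Raphson style
  // step using -y, and then the initial quotient q = umulh(x, z) and
  // remainder r = x - q * y are corrected by at most two conditional
  // adjustments.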
2594 
2595   // Initial estimate of inv(y).
2596   auto FloatY = B.buildUITOFP(S32, Y);
2597   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2598   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2599   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2600   auto Z = B.buildFPTOUI(S32, ScaledY);
2601 
2602   // One round of UNR.
2603   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2604   auto NegYZ = B.buildMul(S32, NegY, Z);
2605   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2606 
2607   // Quotient/remainder estimate.
2608   auto Q = B.buildUMulH(S32, X, Z);
2609   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2610 
2611   // First quotient/remainder refinement.
2612   auto One = B.buildConstant(S32, 1);
2613   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2614   if (IsDiv)
2615     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2616   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2617 
2618   // Second quotient/remainder refinement.
2619   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2620   if (IsDiv)
2621     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2622   else
2623     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2624 }
2625 
2626 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2627                                               MachineRegisterInfo &MRI,
2628                                               MachineIRBuilder &B) const {
2629   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2630   Register DstReg = MI.getOperand(0).getReg();
2631   Register Num = MI.getOperand(1).getReg();
2632   Register Den = MI.getOperand(2).getReg();
2633   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2634   MI.eraseFromParent();
2635   return true;
2636 }
2637 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2639 //
2640 // Return lo, hi of result
2641 //
2642 // %cvt.lo = G_UITOFP Val.lo
2643 // %cvt.hi = G_UITOFP Val.hi
2644 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2645 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2646 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2647 // %mul2 = G_FMUL %mul1, 2**(-32)
2648 // %trunc = G_INTRINSIC_TRUNC %mul2
2649 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2650 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
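// For reference, the float constants used: 0x4f800000 is 2.0^32, 0x5f7ffffc
// is just below 2.0^64 (it scales the reciprocal into fixed point),
// 0x2f800000 is 2.0^-32 and 0xcf800000 is -(2.0^32), used to split the scaled
// estimate back into two 32-bit halves.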
2651 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2652                                                        Register Val) {
2653   const LLT S32 = LLT::scalar(32);
2654   auto Unmerge = B.buildUnmerge(S32, Val);
2655 
2656   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2657   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2658 
2659   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2660                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2661 
2662   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2663   auto Mul1 =
2664       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2665 
2666   // 2**(-32)
2667   auto Mul2 =
2668       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2669   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2670 
2671   // -(2**32)
2672   auto Mad2 = B.buildFMAD(S32, Trunc,
2673                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2674 
2675   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2676   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2677 
2678   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2679 }
2680 
2681 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2682                                                   Register DstReg,
2683                                                   Register Numer,
2684                                                   Register Denom,
2685                                                   bool IsDiv) const {
2686   const LLT S32 = LLT::scalar(32);
2687   const LLT S64 = LLT::scalar(64);
2688   const LLT S1 = LLT::scalar(1);
2689   Register RcpLo, RcpHi;
2690 
2691   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2692 
2693   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2694 
2695   auto Zero64 = B.buildConstant(S64, 0);
2696   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2697 
2698   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2699   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2700 
2701   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2702   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2703   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2704 
2705   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2706   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2707   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2708   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2709 
2710   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2711   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2712   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2713   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2714   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2715 
2716   auto Zero32 = B.buildConstant(S32, 0);
2717   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2718   auto Add2_HiC =
2719       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2720   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2721   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2722 
2723   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2724   Register NumerLo = UnmergeNumer.getReg(0);
2725   Register NumerHi = UnmergeNumer.getReg(1);
2726 
2727   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2728   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2729   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2730   Register Mul3_Lo = UnmergeMul3.getReg(0);
2731   Register Mul3_Hi = UnmergeMul3.getReg(1);
2732   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2733   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2734   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2735   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2736 
2737   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2738   Register DenomLo = UnmergeDenom.getReg(0);
2739   Register DenomHi = UnmergeDenom.getReg(1);
2740 
2741   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2742   auto C1 = B.buildSExt(S32, CmpHi);
2743 
2744   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2745   auto C2 = B.buildSExt(S32, CmpLo);
2746 
2747   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2748   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2749 
2750   // TODO: Here and below portions of the code can be enclosed into if/endif.
2751   // Currently control flow is unconditional and we have 4 selects after
2752   // potential endif to substitute PHIs.
2753 
2754   // if C3 != 0 ...
2755   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2756   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2757   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2758   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2759 
2760   auto One64 = B.buildConstant(S64, 1);
2761   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2762 
2763   auto C4 =
2764       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2765   auto C5 =
2766       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2767   auto C6 = B.buildSelect(
2768       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2769 
2770   // if (C6 != 0)
2771   auto Add4 = B.buildAdd(S64, Add3, One64);
2772   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2773 
2774   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2775   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2776   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2777 
2778   // endif C6
2779   // endif C3
2780 
2781   if (IsDiv) {
2782     auto Sel1 = B.buildSelect(
2783         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2784     B.buildSelect(DstReg,
2785                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2786   } else {
2787     auto Sel2 = B.buildSelect(
2788         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2789     B.buildSelect(DstReg,
2790                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2791   }
2792 }
2793 
2794 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2795                                             MachineRegisterInfo &MRI,
2796                                             MachineIRBuilder &B) const {
2797   const LLT S64 = LLT::scalar(64);
2798   const LLT S32 = LLT::scalar(32);
2799   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2800   Register DstReg = MI.getOperand(0).getReg();
2801   Register Num = MI.getOperand(1).getReg();
2802   Register Den = MI.getOperand(2).getReg();
2803   LLT Ty = MRI.getType(DstReg);
2804 
2805   if (Ty == S32)
2806     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2807   else if (Ty == S64)
2808     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2809   else
2810     return false;
2811 
2812   MI.eraseFromParent();
  return true;
}
2816 
2817 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2818                                             MachineRegisterInfo &MRI,
2819                                             MachineIRBuilder &B) const {
2820   const LLT S64 = LLT::scalar(64);
2821   const LLT S32 = LLT::scalar(32);
2822 
2823   Register DstReg = MI.getOperand(0).getReg();
2824   const LLT Ty = MRI.getType(DstReg);
2825   if (Ty != S32 && Ty != S64)
2826     return false;
2827 
2828   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2829 
2830   Register LHS = MI.getOperand(1).getReg();
2831   Register RHS = MI.getOperand(2).getReg();
2832 
2833   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2834   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2835   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
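  // Two's complement identity used below: with s = x >> (bits - 1) (all ones
  // for negative x, zero otherwise), (x + s) ^ s == |x|. For example, for
  // x = -7 (s32): s = -1, x + s = -8, and -8 ^ -1 = 7. The unsigned div/rem
  // then runs on the magnitudes and the sign is reapplied the same way
  // afterwards.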
2836 
2837   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2838   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2839 
2840   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2841   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2842 
2843   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2844   if (Ty == S32)
2845     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2846   else
2847     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2848 
2849   Register Sign;
2850   if (IsDiv)
2851     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2852   else
2853     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2854 
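  // Reapply the sign: (v ^ s) - s leaves v unchanged when s == 0 and negates it
  // when s == -1.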
2855   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2856   B.buildSub(DstReg, UDivRem, Sign);
2857 
2858   MI.eraseFromParent();
2859   return true;
2860 }
2861 
2862 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2863                                                  MachineRegisterInfo &MRI,
2864                                                  MachineIRBuilder &B) const {
2865   Register Res = MI.getOperand(0).getReg();
2866   Register LHS = MI.getOperand(1).getReg();
2867   Register RHS = MI.getOperand(2).getReg();
2868 
2869   uint16_t Flags = MI.getFlags();
2870 
2871   LLT ResTy = MRI.getType(Res);
2872   LLT S32 = LLT::scalar(32);
2873   LLT S64 = LLT::scalar(64);
2874 
2875   const MachineFunction &MF = B.getMF();
2876   bool Unsafe =
2877     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2878 
2879   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2880     return false;
2881 
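  // Without unsafe math, skip the rcp-based f32 expansion when FP32 denormals
  // are enabled, presumably because the rcp instruction does not produce
  // correct denormal results.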
2882   if (!Unsafe && ResTy == S32 &&
2883       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2884     return false;
2885 
2886   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2887     // 1 / x -> RCP(x)
2888     if (CLHS->isExactlyValue(1.0)) {
2889       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2890         .addUse(RHS)
2891         .setMIFlags(Flags);
2892 
2893       MI.eraseFromParent();
2894       return true;
2895     }
2896 
2897     // -1 / x -> RCP( FNEG(x) )
2898     if (CLHS->isExactlyValue(-1.0)) {
2899       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2900       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2901         .addUse(FNeg.getReg(0))
2902         .setMIFlags(Flags);
2903 
2904       MI.eraseFromParent();
2905       return true;
2906     }
2907   }
2908 
2909   // x / y -> x * (1.0 / y)
2910   if (Unsafe) {
2911     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2912       .addUse(RHS)
2913       .setMIFlags(Flags);
2914     B.buildFMul(Res, LHS, RCP, Flags);
2915 
2916     MI.eraseFromParent();
2917     return true;
2918   }
2919 
2920   return false;
2921 }
2922 
2923 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2924                                          MachineRegisterInfo &MRI,
2925                                          MachineIRBuilder &B) const {
2926   Register Res = MI.getOperand(0).getReg();
2927   Register LHS = MI.getOperand(1).getReg();
2928   Register RHS = MI.getOperand(2).getReg();
2929 
2930   uint16_t Flags = MI.getFlags();
2931 
2932   LLT S16 = LLT::scalar(16);
2933   LLT S32 = LLT::scalar(32);
2934 
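  // Expand f16 fdiv via f32: extend both operands, multiply the numerator by an
  // f32 reciprocal of the denominator, truncate back to f16, and let div_fixup
  // handle the special cases.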
2935   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2936   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2937 
2938   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2939     .addUse(RHSExt.getReg(0))
2940     .setMIFlags(Flags);
2941 
2942   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2943   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2944 
2945   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2946     .addUse(RDst.getReg(0))
2947     .addUse(RHS)
2948     .addUse(LHS)
2949     .setMIFlags(Flags);
2950 
2951   MI.eraseFromParent();
2952   return true;
2953 }
2954 
2955 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2956 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2957 static void toggleSPDenormMode(bool Enable,
2958                                MachineIRBuilder &B,
2959                                const GCNSubtarget &ST,
2960                                AMDGPU::SIModeRegisterDefaults Mode) {
2961   // Set SP denorm mode to this value.
2962   unsigned SPDenormMode =
2963     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2964 
2965   if (ST.hasDenormModeInst()) {
2966     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2967     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2968 
2969     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2970     B.buildInstr(AMDGPU::S_DENORM_MODE)
2971       .addImm(NewDenormModeValue);
2972 
2973   } else {
2974     // Select FP32 bit field in mode register.
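    // hwreg(ID_MODE, offset 4, width 2) addresses the two FP_DENORM bits that
    // control FP32 denormal handling.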
2975     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2976                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2977                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2978 
2979     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2980       .addImm(SPDenormMode)
2981       .addImm(SPDenormModeBitField);
2982   }
2983 }
2984 
2985 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2986                                          MachineRegisterInfo &MRI,
2987                                          MachineIRBuilder &B) const {
2988   Register Res = MI.getOperand(0).getReg();
2989   Register LHS = MI.getOperand(1).getReg();
2990   Register RHS = MI.getOperand(2).getReg();
2991   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2992   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2993 
2994   uint16_t Flags = MI.getFlags();
2995 
2996   LLT S32 = LLT::scalar(32);
2997   LLT S1 = LLT::scalar(1);
2998 
2999   auto One = B.buildFConstant(S32, 1.0f);
3000 
3001   auto DenominatorScaled =
3002     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3003       .addUse(LHS)
3004       .addUse(RHS)
3005       .addImm(0)
3006       .setMIFlags(Flags);
3007   auto NumeratorScaled =
3008     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3009       .addUse(LHS)
3010       .addUse(RHS)
3011       .addImm(1)
3012       .setMIFlags(Flags);
3013 
3014   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3015     .addUse(DenominatorScaled.getReg(0))
3016     .setMIFlags(Flags);
3017   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3018 
3019   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3020   // aren't modeled as reading it.
3021   if (!Mode.allFP32Denormals())
3022     toggleSPDenormMode(true, B, ST, Mode);
3023 
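  // Refine the reciprocal estimate and the scaled quotient with a chain of FMA
  // steps; div_fmas then folds in the remaining error term, and div_fixup below
  // handles the special cases.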
3024   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3025   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3026   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3027   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3028   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3029   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3030 
3031   if (!Mode.allFP32Denormals())
3032     toggleSPDenormMode(false, B, ST, Mode);
3033 
3034   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3035     .addUse(Fma4.getReg(0))
3036     .addUse(Fma1.getReg(0))
3037     .addUse(Fma3.getReg(0))
3038     .addUse(NumeratorScaled.getReg(1))
3039     .setMIFlags(Flags);
3040 
3041   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3042     .addUse(Fmas.getReg(0))
3043     .addUse(RHS)
3044     .addUse(LHS)
3045     .setMIFlags(Flags);
3046 
3047   MI.eraseFromParent();
3048   return true;
3049 }
3050 
3051 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3052                                          MachineRegisterInfo &MRI,
3053                                          MachineIRBuilder &B) const {
3054   Register Res = MI.getOperand(0).getReg();
3055   Register LHS = MI.getOperand(1).getReg();
3056   Register RHS = MI.getOperand(2).getReg();
3057 
3058   uint16_t Flags = MI.getFlags();
3059 
3060   LLT S64 = LLT::scalar(64);
3061   LLT S1 = LLT::scalar(1);
3062 
3063   auto One = B.buildFConstant(S64, 1.0);
3064 
3065   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3066     .addUse(LHS)
3067     .addUse(RHS)
3068     .addImm(0)
3069     .setMIFlags(Flags);
3070 
3071   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3072 
3073   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3074     .addUse(DivScale0.getReg(0))
3075     .setMIFlags(Flags);
3076 
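  // Same Newton-Raphson style refinement as the f32 path, using f64 FMAs and a
  // second div_scale result for the numerator.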
3077   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3078   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3079   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3080 
3081   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3082     .addUse(LHS)
3083     .addUse(RHS)
3084     .addImm(1)
3085     .setMIFlags(Flags);
3086 
3087   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3088   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3089   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3090 
3091   Register Scale;
3092   if (!ST.hasUsableDivScaleConditionOutput()) {
3093     // Work around a hardware bug on SI where the condition output from
3094     // div_scale is not usable.
3095 
3096     LLT S32 = LLT::scalar(32);
3097 
3098     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3099     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3100     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3101     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3102 
3103     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3104                               Scale1Unmerge.getReg(1));
3105     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3106                               Scale0Unmerge.getReg(1));
3107     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3108   } else {
3109     Scale = DivScale1.getReg(1);
3110   }
3111 
3112   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3113     .addUse(Fma4.getReg(0))
3114     .addUse(Fma3.getReg(0))
3115     .addUse(Mul.getReg(0))
3116     .addUse(Scale)
3117     .setMIFlags(Flags);
3118 
3119   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3120     .addUse(Fmas.getReg(0))
3121     .addUse(RHS)
3122     .addUse(LHS)
3123     .setMIFlags(Flags);
3124 
3125   MI.eraseFromParent();
3126   return true;
3127 }
3128 
3129 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3130                                                  MachineRegisterInfo &MRI,
3131                                                  MachineIRBuilder &B) const {
3132   Register Res = MI.getOperand(0).getReg();
3133   Register LHS = MI.getOperand(2).getReg();
3134   Register RHS = MI.getOperand(3).getReg();
3135   uint16_t Flags = MI.getFlags();
3136 
3137   LLT S32 = LLT::scalar(32);
3138   LLT S1 = LLT::scalar(1);
3139 
3140   auto Abs = B.buildFAbs(S32, RHS, Flags);
3141   const APFloat C0Val(1.0f);
3142 
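  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. If |RHS| is very large, scale
  // it down by 2^-32 before taking the reciprocal so the reciprocal does not
  // underflow, then rescale the final product by the same factor.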
3143   auto C0 = B.buildConstant(S32, 0x6f800000);
3144   auto C1 = B.buildConstant(S32, 0x2f800000);
3145   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3146 
3147   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3148   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3149 
3150   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3151 
3152   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3153     .addUse(Mul0.getReg(0))
3154     .setMIFlags(Flags);
3155 
3156   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3157 
3158   B.buildFMul(Res, Sel, Mul1, Flags);
3159 
3160   MI.eraseFromParent();
3161   return true;
3162 }
3163 
3164 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3165 // FIXME: Why do we handle this one but not other removed instructions?
3166 //
3167 // Reciprocal square root.  The clamp prevents infinite results, clamping
3168 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3169 // +-max_float.
3170 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3171                                                     MachineRegisterInfo &MRI,
3172                                                     MachineIRBuilder &B) const {
3173   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3174     return true;
3175 
3176   Register Dst = MI.getOperand(0).getReg();
3177   Register Src = MI.getOperand(2).getReg();
3178   auto Flags = MI.getFlags();
3179 
3180   LLT Ty = MRI.getType(Dst);
3181 
3182   const fltSemantics *FltSemantics;
3183   if (Ty == LLT::scalar(32))
3184     FltSemantics = &APFloat::IEEEsingle();
3185   else if (Ty == LLT::scalar(64))
3186     FltSemantics = &APFloat::IEEEdouble();
3187   else
3188     return false;
3189 
3190   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3191     .addUse(Src)
3192     .setMIFlags(Flags);
3193 
3194   // We don't need to concern ourselves with the snan handling difference; the
3195   // rsq already quieted any snan (or not), so use the one that selects directly.
3196   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3197   const bool UseIEEE = MFI->getMode().IEEE;
3198 
3199   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3200   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3201                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3202 
3203   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3204 
3205   if (UseIEEE)
3206     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3207   else
3208     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3209   MI.eraseFromParent();
3210   return true;
3211 }
3212 
3213 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3214   switch (IID) {
3215   case Intrinsic::amdgcn_ds_fadd:
3216     return AMDGPU::G_ATOMICRMW_FADD;
3217   case Intrinsic::amdgcn_ds_fmin:
3218     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3219   case Intrinsic::amdgcn_ds_fmax:
3220     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3221   default:
3222     llvm_unreachable("not a DS FP intrinsic");
3223   }
3224 }
3225 
3226 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3227                                                       MachineInstr &MI,
3228                                                       Intrinsic::ID IID) const {
3229   GISelChangeObserver &Observer = Helper.Observer;
3230   Observer.changingInstr(MI);
3231 
3232   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3233 
3234   // The remaining operands were used to set fields in the MemOperand on
3235   // construction.
3236   for (int I = 6; I > 3; --I)
3237     MI.RemoveOperand(I);
3238 
3239   MI.RemoveOperand(1); // Remove the intrinsic ID.
3240   Observer.changedInstr(MI);
3241   return true;
3242 }
3243 
3244 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3245                                             MachineRegisterInfo &MRI,
3246                                             MachineIRBuilder &B) const {
3247   uint64_t Offset =
3248     ST.getTargetLowering()->getImplicitParameterOffset(
3249       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3250   LLT DstTy = MRI.getType(DstReg);
3251   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3252 
3253   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3254   if (!loadInputValue(KernargPtrReg, B,
3255                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3256     return false;
3257 
3258   // FIXME: This should be nuw
3259   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3260   return true;
3261 }
3262 
3263 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3264                                                  MachineRegisterInfo &MRI,
3265                                                  MachineIRBuilder &B) const {
3266   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3267   if (!MFI->isEntryFunction()) {
3268     return legalizePreloadedArgIntrin(MI, MRI, B,
3269                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3270   }
3271 
3272   Register DstReg = MI.getOperand(0).getReg();
3273   if (!getImplicitArgPtr(DstReg, MRI, B))
3274     return false;
3275 
3276   MI.eraseFromParent();
3277   return true;
3278 }
3279 
3280 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3281                                               MachineRegisterInfo &MRI,
3282                                               MachineIRBuilder &B,
3283                                               unsigned AddrSpace) const {
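  // A flat pointer is in the queried segment iff the high 32 bits of its
  // address equal that segment's aperture base.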
3284   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3285   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3286   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3287   MI.eraseFromParent();
3288   return true;
3289 }
3290 
3291 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3292 // offset (the offset that is included in bounds checking and swizzling, to be
3293 // split between the instruction's voffset and immoffset fields) and soffset
3294 // (the offset that is excluded from bounds checking and swizzling, to go in
3295 // the instruction's soffset field).  This function takes the first kind of
3296 // offset and figures out how to split it between voffset and immoffset.
3297 std::tuple<Register, unsigned, unsigned>
3298 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3299                                         Register OrigOffset) const {
3300   const unsigned MaxImm = 4095;
3301   Register BaseReg;
3302   unsigned TotalConstOffset;
3303   MachineInstr *OffsetDef;
3304   const LLT S32 = LLT::scalar(32);
3305 
3306   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3307     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3308 
3309   unsigned ImmOffset = TotalConstOffset;
3310 
3311   // If the immediate value is too big for the immoffset field, keep only the
3312   // low 12 bits in the immoffset field and add the remainder, a multiple of
3313   // 4096, to the value that is copied/added for the voffset field, so that it
3314   // stands more chance of being CSEd with the copy/add for another similar
3315   // load/store. However, do not do that rounding down to a multiple of 4096 if
3316   // that is a negative number, as it appears to be illegal to have a negative
3317   // offset in the vgpr, even if adding the immediate offset makes it positive.
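  // For example, a total constant offset of 8212 splits into 8192 added to the
  // voffset and 20 placed in the immoffset field.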
3318   unsigned Overflow = ImmOffset & ~MaxImm;
3319   ImmOffset -= Overflow;
3320   if ((int32_t)Overflow < 0) {
3321     Overflow += ImmOffset;
3322     ImmOffset = 0;
3323   }
3324 
3325   if (Overflow != 0) {
3326     if (!BaseReg) {
3327       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3328     } else {
3329       auto OverflowVal = B.buildConstant(S32, Overflow);
3330       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3331     }
3332   }
3333 
3334   if (!BaseReg)
3335     BaseReg = B.buildConstant(S32, 0).getReg(0);
3336 
3337   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3338 }
3339 
3340 /// Handle register layout difference for f16 images for some subtargets.
3341 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3342                                              MachineRegisterInfo &MRI,
3343                                              Register Reg) const {
3344   if (!ST.hasUnpackedD16VMem())
3345     return Reg;
3346 
3347   const LLT S16 = LLT::scalar(16);
3348   const LLT S32 = LLT::scalar(32);
3349   LLT StoreVT = MRI.getType(Reg);
3350   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3351 
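  // With unpacked D16, each 16-bit element lives in the low half of its own
  // 32-bit register, so widen every element to s32 and rebuild the vector.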
3352   auto Unmerge = B.buildUnmerge(S16, Reg);
3353 
3354   SmallVector<Register, 4> WideRegs;
3355   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3356     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3357 
3358   int NumElts = StoreVT.getNumElements();
3359 
3360   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3361 }
3362 
3363 Register AMDGPULegalizerInfo::fixStoreSourceType(
3364   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3365   MachineRegisterInfo *MRI = B.getMRI();
3366   LLT Ty = MRI->getType(VData);
3367 
3368   const LLT S16 = LLT::scalar(16);
3369 
3370   // Fixup illegal register types for i8 and i16 stores.
3371   if (Ty == LLT::scalar(8) || Ty == S16) {
3372     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3373     return AnyExt;
3374   }
3375 
3376   if (Ty.isVector()) {
3377     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3378       if (IsFormat)
3379         return handleD16VData(B, *MRI, VData);
3380     }
3381   }
3382 
3383   return VData;
3384 }
3385 
3386 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3387                                               MachineRegisterInfo &MRI,
3388                                               MachineIRBuilder &B,
3389                                               bool IsTyped,
3390                                               bool IsFormat) const {
3391   Register VData = MI.getOperand(1).getReg();
3392   LLT Ty = MRI.getType(VData);
3393   LLT EltTy = Ty.getScalarType();
3394   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3395   const LLT S32 = LLT::scalar(32);
3396 
3397   VData = fixStoreSourceType(B, VData, IsFormat);
3398   Register RSrc = MI.getOperand(2).getReg();
3399 
3400   MachineMemOperand *MMO = *MI.memoperands_begin();
3401   const int MemSize = MMO->getSize();
3402 
3403   unsigned ImmOffset;
3404   unsigned TotalOffset;
3405 
3406   // The typed intrinsics add an immediate after the registers.
3407   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3408 
3409   // The struct intrinsic variants add one additional operand over raw.
3410   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3411   Register VIndex;
3412   int OpOffset = 0;
3413   if (HasVIndex) {
3414     VIndex = MI.getOperand(3).getReg();
3415     OpOffset = 1;
3416   }
3417 
3418   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3419   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3420 
3421   unsigned Format = 0;
3422   if (IsTyped) {
3423     Format = MI.getOperand(5 + OpOffset).getImm();
3424     ++OpOffset;
3425   }
3426 
3427   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3428 
3429   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3430   if (TotalOffset != 0)
3431     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3432 
3433   unsigned Opc;
3434   if (IsTyped) {
3435     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3436                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3437   } else if (IsFormat) {
3438     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3439                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3440   } else {
3441     switch (MemSize) {
3442     case 1:
3443       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3444       break;
3445     case 2:
3446       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3447       break;
3448     default:
3449       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3450       break;
3451     }
3452   }
3453 
3454   if (!VIndex)
3455     VIndex = B.buildConstant(S32, 0).getReg(0);
3456 
3457   auto MIB = B.buildInstr(Opc)
3458     .addUse(VData)              // vdata
3459     .addUse(RSrc)               // rsrc
3460     .addUse(VIndex)             // vindex
3461     .addUse(VOffset)            // voffset
3462     .addUse(SOffset)            // soffset
3463     .addImm(ImmOffset);         // offset(imm)
3464 
3465   if (IsTyped)
3466     MIB.addImm(Format);
3467 
3468   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3469      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3470      .addMemOperand(MMO);
3471 
3472   MI.eraseFromParent();
3473   return true;
3474 }
3475 
3476 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3477                                              MachineRegisterInfo &MRI,
3478                                              MachineIRBuilder &B,
3479                                              bool IsFormat,
3480                                              bool IsTyped) const {
3481   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3482   MachineMemOperand *MMO = *MI.memoperands_begin();
3483   const int MemSize = MMO->getSize();
3484   const LLT S32 = LLT::scalar(32);
3485 
3486   Register Dst = MI.getOperand(0).getReg();
3487   Register RSrc = MI.getOperand(2).getReg();
3488 
3489   // The typed intrinsics add an immediate after the registers.
3490   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3491 
3492   // The struct intrinsic variants add one additional operand over raw.
3493   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3494   Register VIndex;
3495   int OpOffset = 0;
3496   if (HasVIndex) {
3497     VIndex = MI.getOperand(3).getReg();
3498     OpOffset = 1;
3499   }
3500 
3501   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3502   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3503 
3504   unsigned Format = 0;
3505   if (IsTyped) {
3506     Format = MI.getOperand(5 + OpOffset).getImm();
3507     ++OpOffset;
3508   }
3509 
3510   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3511   unsigned ImmOffset;
3512   unsigned TotalOffset;
3513 
3514   LLT Ty = MRI.getType(Dst);
3515   LLT EltTy = Ty.getScalarType();
3516   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3517   const bool Unpacked = ST.hasUnpackedD16VMem();
3518 
3519   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3520   if (TotalOffset != 0)
3521     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3522 
3523   unsigned Opc;
3524 
3525   if (IsTyped) {
3526     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3527                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3528   } else if (IsFormat) {
3529     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3530                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3531   } else {
3532     switch (MemSize) {
3533     case 1:
3534       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3535       break;
3536     case 2:
3537       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3538       break;
3539     default:
3540       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3541       break;
3542     }
3543   }
3544 
3545   Register LoadDstReg;
3546 
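  // Sub-dword non-d16 loads and scalar d16 loads produce a 32-bit result that
  // is truncated back to the requested type after the load.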
3547   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3548   LLT UnpackedTy = Ty.changeElementSize(32);
3549 
3550   if (IsExtLoad)
3551     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3552   else if (Unpacked && IsD16 && Ty.isVector())
3553     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3554   else
3555     LoadDstReg = Dst;
3556 
3557   if (!VIndex)
3558     VIndex = B.buildConstant(S32, 0).getReg(0);
3559 
3560   auto MIB = B.buildInstr(Opc)
3561     .addDef(LoadDstReg)         // vdata
3562     .addUse(RSrc)               // rsrc
3563     .addUse(VIndex)             // vindex
3564     .addUse(VOffset)            // voffset
3565     .addUse(SOffset)            // soffset
3566     .addImm(ImmOffset);         // offset(imm)
3567 
3568   if (IsTyped)
3569     MIB.addImm(Format);
3570 
3571   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3572      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3573      .addMemOperand(MMO);
3574 
3575   if (LoadDstReg != Dst) {
3576     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3577 
3578     // The result of an extending load was widened; truncate it back down.
3579     if (IsExtLoad)
3580       B.buildTrunc(Dst, LoadDstReg);
3581     else {
3582       // Repack to original 16-bit vector result
3583       // FIXME: G_TRUNC should work, but legalization currently fails
3584       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3585       SmallVector<Register, 4> Repack;
3586       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3587         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3588       B.buildMerge(Dst, Repack);
3589     }
3590   }
3591 
3592   MI.eraseFromParent();
3593   return true;
3594 }
3595 
3596 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3597                                                MachineIRBuilder &B,
3598                                                bool IsInc) const {
3599   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3600                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3601   B.buildInstr(Opc)
3602     .addDef(MI.getOperand(0).getReg())
3603     .addUse(MI.getOperand(2).getReg())
3604     .addUse(MI.getOperand(3).getReg())
3605     .cloneMemRefs(MI);
3606   MI.eraseFromParent();
3607   return true;
3608 }
3609 
3610 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3611   switch (IntrID) {
3612   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3613   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3614     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3615   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3616   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3617     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3618   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3619   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3620     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3621   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3622   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3623     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3624   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3625   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3626     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3627   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3628   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3629     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3630   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3631   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3632     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3633   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3634   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3635     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3636   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3637   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3638     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3639   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3640   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3641     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3642   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3643   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3644     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3645   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3646   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3647     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3648   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3649   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3650     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3651   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3652   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3653     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3654   default:
3655     llvm_unreachable("unhandled atomic opcode");
3656   }
3657 }
3658 
3659 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3660                                                MachineIRBuilder &B,
3661                                                Intrinsic::ID IID) const {
3662   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3663                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3664   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3665 
3666   Register Dst;
3667 
3668   int OpOffset = 0;
3669   if (HasReturn) {
3670     // A few FP atomics do not support return values.
3671     Dst = MI.getOperand(0).getReg();
3672   } else {
3673     OpOffset = -1;
3674   }
3675 
3676   Register VData = MI.getOperand(2 + OpOffset).getReg();
3677   Register CmpVal;
3678 
3679   if (IsCmpSwap) {
3680     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3681     ++OpOffset;
3682   }
3683 
3684   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3685   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3686 
3687   // The struct intrinsic variants add one additional operand over raw.
3688   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3689   Register VIndex;
3690   if (HasVIndex) {
3691     VIndex = MI.getOperand(4 + OpOffset).getReg();
3692     ++OpOffset;
3693   }
3694 
3695   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3696   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3697   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3698 
3699   MachineMemOperand *MMO = *MI.memoperands_begin();
3700 
3701   unsigned ImmOffset;
3702   unsigned TotalOffset;
3703   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3704   if (TotalOffset != 0)
3705     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3706 
3707   if (!VIndex)
3708     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3709 
3710   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3711 
3712   if (HasReturn)
3713     MIB.addDef(Dst);
3714 
3715   MIB.addUse(VData); // vdata
3716 
3717   if (IsCmpSwap)
3718     MIB.addReg(CmpVal);
3719 
3720   MIB.addUse(RSrc)               // rsrc
3721      .addUse(VIndex)             // vindex
3722      .addUse(VOffset)            // voffset
3723      .addUse(SOffset)            // soffset
3724      .addImm(ImmOffset)          // offset(imm)
3725      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3726      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3727      .addMemOperand(MMO);
3728 
3729   MI.eraseFromParent();
3730   return true;
3731 }
3732 
3733 /// Pack the s16 typed address operands of \p MI, from \p AddrIdx up to
3734 /// \p EndIdx, into dword sized <2 x s16> registers appended to \p PackedAddrs.
3735 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3736                                         SmallVectorImpl<Register> &PackedAddrs,
3737                                         int AddrIdx, int DimIdx, int EndIdx,
3738                                         int NumGradients) {
3739   const LLT S16 = LLT::scalar(16);
3740   const LLT V2S16 = LLT::vector(2, 16);
3741 
3742   for (int I = AddrIdx; I < EndIdx; ++I) {
3743     MachineOperand &SrcOp = MI.getOperand(I);
3744     if (!SrcOp.isReg())
3745       continue; // _L to _LZ may have eliminated this.
3746 
3747     Register AddrReg = SrcOp.getReg();
3748 
3749     if (I < DimIdx) {
3750       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3751       PackedAddrs.push_back(AddrReg);
3752     } else {
3753       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3754       // derivatives dx/dh and dx/dv are packed with undef.
3755       if (((I + 1) >= EndIdx) ||
3756           ((NumGradients / 2) % 2 == 1 &&
3757            (I == DimIdx + (NumGradients / 2) - 1 ||
3758             I == DimIdx + NumGradients - 1)) ||
3759           // Check for _L to _LZ optimization
3760           !MI.getOperand(I + 1).isReg()) {
3761         PackedAddrs.push_back(
3762             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3763                 .getReg(0));
3764       } else {
3765         PackedAddrs.push_back(
3766             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3767                 .getReg(0));
3768         ++I;
3769       }
3770     }
3771   }
3772 }
3773 
3774 /// Convert from separate vaddr components to a single vector address register,
3775 /// and replace the remaining operands with $noreg.
3776 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3777                                      int DimIdx, int NumVAddrs) {
3778   const LLT S32 = LLT::scalar(32);
3779 
3780   SmallVector<Register, 8> AddrRegs;
3781   for (int I = 0; I != NumVAddrs; ++I) {
3782     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3783     if (SrcOp.isReg()) {
3784       AddrRegs.push_back(SrcOp.getReg());
3785       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3786     }
3787   }
3788 
3789   int NumAddrRegs = AddrRegs.size();
3790   if (NumAddrRegs != 1) {
3791     // Round up to 8 elements for v5-v7
3792     // FIXME: Missing intermediate sized register classes and instructions.
3793     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3794       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3795       auto Undef = B.buildUndef(S32);
3796       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3797       NumAddrRegs = RoundedNumRegs;
3798     }
3799 
3800     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3801     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3802   }
3803 
3804   for (int I = 1; I != NumVAddrs; ++I) {
3805     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3806     if (SrcOp.isReg())
3807       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3808   }
3809 }
3810 
3811 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3812 ///
3813 /// Depending on the subtarget, load/store with 16-bit element data need to be
3814 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3815 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3816 /// registers.
3817 ///
3818 /// We don't want to directly select image instructions just yet, but we also
3819 /// want to expose all register repacking to the legalizer/combiners. We also
3820 /// don't want a selected instruction entering RegBankSelect. In order to avoid
3821 /// defining a multitude of intermediate image instructions, directly hack on
3822 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3823 /// now unnecessary arguments with $noreg.
3824 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3825     MachineInstr &MI, MachineIRBuilder &B,
3826     GISelChangeObserver &Observer,
3827     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3828 
3829   const int NumDefs = MI.getNumExplicitDefs();
3830   bool IsTFE = NumDefs == 2;
3831   // We are only processing the operands of d16 image operations on subtargets
3832   // that use the unpacked register layout, or need to repack the TFE result.
3833 
3834   // TODO: Do we need to guard against already legalized intrinsics?
3835   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3836     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3837 
3838   MachineRegisterInfo *MRI = B.getMRI();
3839   const LLT S32 = LLT::scalar(32);
3840   const LLT S16 = LLT::scalar(16);
3841   const LLT V2S16 = LLT::vector(2, 16);
3842 
3843   // Index of first address argument
3844   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3845 
3846   int NumVAddrs, NumGradients;
3847   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3848   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3849     getDMaskIdx(BaseOpcode, NumDefs);
3850   unsigned DMask = 0;
3851 
3852   // Check for 16 bit addresses and pack if true.
3853   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3854   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3855   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3856   const bool IsG16 = GradTy == S16;
3857   const bool IsA16 = AddrTy == S16;
3858 
3859   int DMaskLanes = 0;
3860   if (!BaseOpcode->Atomic) {
3861     DMask = MI.getOperand(DMaskIdx).getImm();
3862     if (BaseOpcode->Gather4) {
3863       DMaskLanes = 4;
3864     } else if (DMask != 0) {
3865       DMaskLanes = countPopulation(DMask);
3866     } else if (!IsTFE && !BaseOpcode->Store) {
3867       // If dmask is 0, this is a no-op load. This can be eliminated.
3868       B.buildUndef(MI.getOperand(0));
3869       MI.eraseFromParent();
3870       return true;
3871     }
3872   }
3873 
3874   Observer.changingInstr(MI);
3875   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3876 
3877   unsigned NewOpcode = NumDefs == 0 ?
3878     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3879 
3880   // Track that we legalized this
3881   MI.setDesc(B.getTII().get(NewOpcode));
3882 
3883   // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
3884   // to be at least 1, otherwise the instruction will fail.
3885   if (IsTFE && DMask == 0) {
3886     DMask = 0x1;
3887     DMaskLanes = 1;
3888     MI.getOperand(DMaskIdx).setImm(DMask);
3889   }
3890 
3891   if (BaseOpcode->Atomic) {
3892     Register VData0 = MI.getOperand(2).getReg();
3893     LLT Ty = MRI->getType(VData0);
3894 
3895     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3896     if (Ty.isVector())
3897       return false;
3898 
3899     if (BaseOpcode->AtomicX2) {
3900       Register VData1 = MI.getOperand(3).getReg();
3901       // The two values are packed in one register.
3902       LLT PackedTy = LLT::vector(2, Ty);
3903       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3904       MI.getOperand(2).setReg(Concat.getReg(0));
3905       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3906     }
3907   }
3908 
3909   int CorrectedNumVAddrs = NumVAddrs;
3910 
3911   // Optimize _L to _LZ when _L is zero
3912   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3913         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3914     const ConstantFP *ConstantLod;
3915     const int LodIdx = AddrIdx + NumVAddrs - 1;
3916 
3917     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3918       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3919         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3920         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3921           LZMappingInfo->LZ, ImageDimIntr->Dim);
3922 
3923         // The starting indexes should remain in the same place.
3924         --NumVAddrs;
3925         --CorrectedNumVAddrs;
3926 
3927         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3928           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3929         MI.RemoveOperand(LodIdx);
3930       }
3931     }
3932   }
3933 
3934   // Optimize _mip away, when 'lod' is zero
3935   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3936     int64_t ConstantLod;
3937     const int LodIdx = AddrIdx + NumVAddrs - 1;
3938 
3939     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3940       if (ConstantLod == 0) {
3941         // TODO: Change intrinsic opcode and remove operand instead of replacing
3942         // it with 0, as the _L to _LZ handling does above.
3943         MI.getOperand(LodIdx).ChangeToImmediate(0);
3944         --CorrectedNumVAddrs;
3945       }
3946     }
3947   }
3948 
3949   // Rewrite the addressing register layout before doing anything else.
3950   if (IsA16 || IsG16) {
3951     if (IsA16) {
3952       // Target must support the feature and gradients need to be 16 bit too
3953       if (!ST.hasA16() || !IsG16)
3954         return false;
3955     } else if (!ST.hasG16())
3956       return false;
3957 
3958     if (NumVAddrs > 1) {
3959       SmallVector<Register, 4> PackedRegs;
3960       // Don't compress addresses for G16
3961       const int PackEndIdx =
3962           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3963       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3964                                   PackEndIdx, NumGradients);
3965 
3966       if (!IsA16) {
3967         // Add uncompressed address
3968         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3969           int AddrReg = MI.getOperand(I).getReg();
3970           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3971           PackedRegs.push_back(AddrReg);
3972         }
3973       }
3974 
3975       // See also below in the non-a16 branch
3976       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3977 
3978       if (!UseNSA && PackedRegs.size() > 1) {
3979         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3980         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3981         PackedRegs[0] = Concat.getReg(0);
3982         PackedRegs.resize(1);
3983       }
3984 
3985       const int NumPacked = PackedRegs.size();
3986       for (int I = 0; I != NumVAddrs; ++I) {
3987         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3988         if (!SrcOp.isReg()) {
3989           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3990           continue;
3991         }
3992 
3993         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3994 
3995         if (I < NumPacked)
3996           SrcOp.setReg(PackedRegs[I]);
3997         else
3998           SrcOp.setReg(AMDGPU::NoRegister);
3999       }
4000     }
4001   } else {
4002     // If the register allocator cannot place the address registers contiguously
4003     // without introducing moves, then using the non-sequential address encoding
4004     // is always preferable, since it saves VALU instructions and is usually a
4005     // wash in terms of code size or even better.
4006     //
4007     // However, we currently have no way of hinting to the register allocator
4008     // that MIMG addresses should be placed contiguously when it is possible to
4009     // do so, so force non-NSA for the common 2-address case as a heuristic.
4010     //
4011     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4012     // allocation when possible.
4013     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4014 
4015     if (!UseNSA && NumVAddrs > 1)
4016       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4017   }
4018 
4019   int Flags = 0;
4020   if (IsA16)
4021     Flags |= 1;
4022   if (IsG16)
4023     Flags |= 2;
4024   MI.addOperand(MachineOperand::CreateImm(Flags));
4025 
4026   if (BaseOpcode->Store) { // No TFE for stores?
4027     // TODO: Handle dmask trim
4028     Register VData = MI.getOperand(1).getReg();
4029     LLT Ty = MRI->getType(VData);
4030     if (!Ty.isVector() || Ty.getElementType() != S16)
4031       return true;
4032 
4033     Register RepackedReg = handleD16VData(B, *MRI, VData);
4034     if (RepackedReg != VData) {
4035       MI.getOperand(1).setReg(RepackedReg);
4036     }
4037 
4038     return true;
4039   }
4040 
4041   Register DstReg = MI.getOperand(0).getReg();
4042   LLT Ty = MRI->getType(DstReg);
4043   const LLT EltTy = Ty.getScalarType();
4044   const bool IsD16 = Ty.getScalarType() == S16;
4045   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4046 
4047   // Confirm that the return type is large enough for the dmask specified
4048   if (NumElts < DMaskLanes)
4049     return false;
4050 
4051   if (NumElts > 4 || DMaskLanes > 4)
4052     return false;
4053 
4054   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4055   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4056 
4057   // The raw dword aligned data component of the load. The only legal cases
4058   // where this matters should be when using the packed D16 format, for
4059   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4060   LLT RoundedTy;
4061 
4062   // S32 vector to cover all data, plus TFE result element.
4063   LLT TFETy;
4064 
4065   // Register type to use for each loaded component. Will be S32 or V2S16.
4066   LLT RegTy;
4067 
4068   if (IsD16 && ST.hasUnpackedD16VMem()) {
4069     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4070     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4071     RegTy = S32;
4072   } else {
4073     unsigned EltSize = EltTy.getSizeInBits();
4074     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4075     unsigned RoundedSize = 32 * RoundedElts;
4076     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4077     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4078     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4079   }
4080 
4081   // The return type does not need adjustment.
4082   // TODO: Should we change s16 case to s32 or <2 x s16>?
4083   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4084     return true;
4085 
4086   Register Dst1Reg;
4087 
4088   // Insert after the instruction.
4089   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4090 
4091   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4092   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4093   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4094   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4095 
4096   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4097 
4098   MI.getOperand(0).setReg(NewResultReg);
4099 
4100   // In the IR, TFE is supposed to be used with a 2 element struct return
4101   // type. The instruction really returns these two values in one contiguous
4102   // register, with one additional dword beyond the loaded data. Rewrite the
4103   // return type to use a single register result.
4104 
4105   if (IsTFE) {
4106     Dst1Reg = MI.getOperand(1).getReg();
4107     if (MRI->getType(Dst1Reg) != S32)
4108       return false;
4109 
4110     // TODO: Make sure the TFE operand bit is set.
4111     MI.RemoveOperand(1);
4112 
4113     // Handle the easy case that requires no repack instructions.
4114     if (Ty == S32) {
4115       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4116       return true;
4117     }
4118   }
4119 
4120   // Now figure out how to copy the new result register back into the old
4121   // result.
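  // Seed every entry with Dst1Reg so that, in the TFE case, the trailing status
  // dword is unmerged directly into its destination; the data entries are
  // replaced with fresh registers below.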
4122   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4123 
4124   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4125 
4126   if (ResultNumRegs == 1) {
4127     assert(!IsTFE);
4128     ResultRegs[0] = NewResultReg;
4129   } else {
4130     // We have to repack into a new vector of some kind.
4131     for (int I = 0; I != NumDataRegs; ++I)
4132       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4133     B.buildUnmerge(ResultRegs, NewResultReg);
4134 
4135     // Drop the final TFE element to get the data part. The TFE result is
4136     // directly written to the right place already.
4137     if (IsTFE)
4138       ResultRegs.resize(NumDataRegs);
4139   }
4140 
4141   // For an s16 scalar result, we form an s32 result with a truncate regardless
4142   // of packed vs. unpacked.
4143   if (IsD16 && !Ty.isVector()) {
4144     B.buildTrunc(DstReg, ResultRegs[0]);
4145     return true;
4146   }
4147 
4148   // Avoid a build/concat_vector of 1 entry.
4149   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4150     B.buildBitcast(DstReg, ResultRegs[0]);
4151     return true;
4152   }
4153 
4154   assert(Ty.isVector());
4155 
4156   if (IsD16) {
4157     // For packed D16 results with TFE enabled, all the data components are
4158     // S32. Cast back to the expected type.
4159     //
4160     // TODO: We don't really need to load s32 elements. We would only need one
4161     // cast for the TFE result if a multiple of v2s16 was used.
4162     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4163       for (Register &Reg : ResultRegs)
4164         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4165     } else if (ST.hasUnpackedD16VMem()) {
4166       for (Register &Reg : ResultRegs)
4167         Reg = B.buildTrunc(S16, Reg).getReg(0);
4168     }
4169   }
4170 
4171   auto padWithUndef = [&](LLT Ty, int NumElts) {
4172     if (NumElts == 0)
4173       return;
4174     Register Undef = B.buildUndef(Ty).getReg(0);
4175     for (int I = 0; I != NumElts; ++I)
4176       ResultRegs.push_back(Undef);
4177   };
4178 
4179   // Pad out any elements eliminated due to the dmask.
4180   LLT ResTy = MRI->getType(ResultRegs[0]);
4181   if (!ResTy.isVector()) {
4182     padWithUndef(ResTy, NumElts - ResultRegs.size());
4183     B.buildBuildVector(DstReg, ResultRegs);
4184     return true;
4185   }
4186 
4187   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4188   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4189 
4190   // Deal with the one annoying legal case.
4191   const LLT V3S16 = LLT::vector(3, 16);
4192   if (Ty == V3S16) {
4193     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4194     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4195     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4196     return true;
4197   }
4198 
4199   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4200   B.buildConcatVectors(DstReg, ResultRegs);
4201   return true;
4202 }
4203 
4204 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4205   LegalizerHelper &Helper, MachineInstr &MI) const {
4206   MachineIRBuilder &B = Helper.MIRBuilder;
4207   GISelChangeObserver &Observer = Helper.Observer;
4208 
4209   Register Dst = MI.getOperand(0).getReg();
4210   LLT Ty = B.getMRI()->getType(Dst);
4211   unsigned Size = Ty.getSizeInBits();
4212   MachineFunction &MF = B.getMF();
4213 
4214   Observer.changingInstr(MI);
4215 
4216   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4217     Ty = getBitcastRegisterType(Ty);
4218     Helper.bitcastDst(MI, Ty, 0);
4219     Dst = MI.getOperand(0).getReg();
4220     B.setInsertPt(B.getMBB(), MI);
4221   }
4222 
4223   // FIXME: We don't really need this intermediate instruction. The intrinsic
4224   // should be fixed to have a memory operand. Since it's readnone, we're not
4225   // allowed to add one.
4226   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4227   MI.RemoveOperand(1); // Remove intrinsic ID
4228 
4229   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4230   // TODO: Should this use datalayout alignment?
4231   const unsigned MemSize = (Size + 7) / 8;
4232   const Align MemAlign(4);
4233   MachineMemOperand *MMO = MF.getMachineMemOperand(
4234       MachinePointerInfo(),
4235       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4236           MachineMemOperand::MOInvariant,
4237       MemSize, MemAlign);
4238   MI.addMemOperand(MF, MMO);
4239 
4240   // There are no 96-bit result scalar loads, but widening to 128-bit should
4241   // always be legal. We may need to restore this to a 96-bit result if it turns
4242   // out this needs to be converted to a vector load during RegBankSelect.
4243   if (!isPowerOf2_32(Size)) {
4244     if (Ty.isVector())
4245       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4246     else
4247       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4248   }
4249 
4250   Observer.changedInstr(MI);
4251   return true;
4252 }
4253 
4254 // TODO: Move to selection
4255 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4256                                                 MachineRegisterInfo &MRI,
4257                                                 MachineIRBuilder &B) const {
4258   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4259   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4260       !ST.isTrapHandlerEnabled()) {
4261     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4262   } else {
4263     // Pass queue pointer to trap handler as input, and insert trap instruction
4264     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4265     MachineRegisterInfo &MRI = *B.getMRI();
4266 
4267     Register LiveIn =
4268       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4269     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4270       return false;
4271 
4272     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4273     B.buildCopy(SGPR01, LiveIn);
4274     B.buildInstr(AMDGPU::S_TRAP)
4275         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4276         .addReg(SGPR01, RegState::Implicit);
4277   }
4278 
4279   MI.eraseFromParent();
4280   return true;
4281 }
4282 
4283 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4284     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4285   // If this is a non-HSA path or the trap handler is disabled, report a warning
4286   // accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

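      // Operand 1 is the exec mask def of the intrinsic; operand 3 is the
      // condition (if) or saved mask (else) input.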
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
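      // Replace the conditional branch with the SI_IF/SI_ELSE pseudo, which
      // branches to the original unconditional branch target.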
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

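      // The mask def and use are wave lane masks, so constrain them to the
      // wave mask register class.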
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
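      // Replace the conditional branch with the SI_LOOP pseudo, branching to
      // the original unconditional branch target.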
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
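    // The wavefront size is a subtarget constant; fold it to an immediate.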
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  default: {
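    // Image dim intrinsics are legalized using their dim info table entry;
    // other intrinsics need no custom legalization here.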
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}