1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use "
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
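// (e.g. <3 x s16> becomes <4 x s16>).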
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
63 /// \returns true if this is an odd sized vector which should widen by adding an
64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
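// Predicate: the total size of the type at TypeIdx is a multiple of 32 bits.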
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
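// Predicate: the type at TypeIdx is a vector of 16-bit elements with more
// than two elements.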
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
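// Mutation: widen the vector at TypeIdx by a single element.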
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
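// Mutation: reduce the number of elements at TypeIdx so each resulting piece
// is at most 64 bits wide.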
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
114 // Increase the number of vector elements so the total size reaches the next
115 // multiple of 32 bits.
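// For example, <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).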
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
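// Return the type a value should be bitcast to for legalization: a plain
// scalar for sizes up to 32 bits, otherwise an s32 scalar or a vector of s32.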
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
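// Mutation: bitcast the type at TypeIdx to its register type (see
// getBitcastRegisterType).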
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
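// Mutation: bitcast the type at TypeIdx, whose size must be a multiple of 32,
// to a scalar or vector with 32-bit elements.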
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
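// True for sizes that fit evenly in registers: multiples of 32 bits up to the
// maximum register size.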
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
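// True for element types that pack cleanly into registers: 16-bit elements or
// any multiple of 32 bits.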
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
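// True for vector types whose layout maps directly to registers: 32, 64, 128
// or 256-bit elements, or 16-bit elements in even counts.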
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
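// True if a value of type Ty occupies a whole number of 32-bit registers and,
// for vectors, has a register-compatible element layout.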
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
208 // Any combination of 32 or 64-bit elements up to the maximum register size,
209 // and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
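// Predicate: the type at TypeIdx is a vector whose element type is s16 or at
// least 32 bits wide.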
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
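// Predicate: a scalar wider than 32 bits that is stored with a narrower
// memory size, i.e. a wide truncating store.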
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
248     // Treat constant and global as identical. SMRD loads are sometimes usable
249     // for global loads (ideally constant address space should be eliminated)
250     // depending on the context. Legality cannot be context dependent, but
251     // RegBankSelect can split the load as necessary depending on the pointer
252     // register bank/uniformity and if the memory is invariant or not written
253     // in a kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
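// Check whether the register type, memory size, alignment and address space
// described by Query are directly supported for this load or store opcode.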
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
325 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
326 // work around this. Eventually it should ignore the type for loads and only
327 // care about the size. Return true in cases where we will work around this
328 // for now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
341 
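// A load or store is legal when the result is a register type, the size and
// alignment are supported, and no bitcast workaround is required.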
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348 
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
353   const unsigned Size = Ty.getSizeInBits();
354   if (Size != MemSizeInBits)
355     return Size <= 32 && Ty.isVector();
356 
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
362 
363 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
364                                          const GCNTargetMachine &TM)
365   :  ST(ST_) {
366   using namespace TargetOpcode;
367 
368   auto GetAddrSpacePtr = [&TM](unsigned AS) {
369     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
370   };
371 
372   const LLT S1 = LLT::scalar(1);
373   const LLT S16 = LLT::scalar(16);
374   const LLT S32 = LLT::scalar(32);
375   const LLT S64 = LLT::scalar(64);
376   const LLT S128 = LLT::scalar(128);
377   const LLT S256 = LLT::scalar(256);
378   const LLT S512 = LLT::scalar(512);
379   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
380 
381   const LLT V2S16 = LLT::vector(2, 16);
382   const LLT V4S16 = LLT::vector(4, 16);
383 
384   const LLT V2S32 = LLT::vector(2, 32);
385   const LLT V3S32 = LLT::vector(3, 32);
386   const LLT V4S32 = LLT::vector(4, 32);
387   const LLT V5S32 = LLT::vector(5, 32);
388   const LLT V6S32 = LLT::vector(6, 32);
389   const LLT V7S32 = LLT::vector(7, 32);
390   const LLT V8S32 = LLT::vector(8, 32);
391   const LLT V9S32 = LLT::vector(9, 32);
392   const LLT V10S32 = LLT::vector(10, 32);
393   const LLT V11S32 = LLT::vector(11, 32);
394   const LLT V12S32 = LLT::vector(12, 32);
395   const LLT V13S32 = LLT::vector(13, 32);
396   const LLT V14S32 = LLT::vector(14, 32);
397   const LLT V15S32 = LLT::vector(15, 32);
398   const LLT V16S32 = LLT::vector(16, 32);
399   const LLT V32S32 = LLT::vector(32, 32);
400 
401   const LLT V2S64 = LLT::vector(2, 64);
402   const LLT V3S64 = LLT::vector(3, 64);
403   const LLT V4S64 = LLT::vector(4, 64);
404   const LLT V5S64 = LLT::vector(5, 64);
405   const LLT V6S64 = LLT::vector(6, 64);
406   const LLT V7S64 = LLT::vector(7, 64);
407   const LLT V8S64 = LLT::vector(8, 64);
408   const LLT V16S64 = LLT::vector(16, 64);
409 
410   std::initializer_list<LLT> AllS32Vectors =
411     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
412      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
413   std::initializer_list<LLT> AllS64Vectors =
414     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
415 
416   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
417   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
418   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
419   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
420   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
421   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
422   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
423 
424   const LLT CodePtr = FlatPtr;
425 
426   const std::initializer_list<LLT> AddrSpaces64 = {
427     GlobalPtr, ConstantPtr, FlatPtr
428   };
429 
430   const std::initializer_list<LLT> AddrSpaces32 = {
431     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
432   };
433 
434   const std::initializer_list<LLT> FPTypesBase = {
435     S32, S64
436   };
437 
438   const std::initializer_list<LLT> FPTypes16 = {
439     S32, S64, S16
440   };
441 
442   const std::initializer_list<LLT> FPTypesPK16 = {
443     S32, S64, S16, V2S16
444   };
445 
446   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
447 
448   setAction({G_BRCOND, S1}, Legal); // VCC branches
449   setAction({G_BRCOND, S32}, Legal); // SCC branches
450 
451   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
452   // elements for v3s16
453   getActionDefinitionsBuilder(G_PHI)
454     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
455     .legalFor(AllS32Vectors)
456     .legalFor(AllS64Vectors)
457     .legalFor(AddrSpaces64)
458     .legalFor(AddrSpaces32)
459     .legalIf(isPointer(0))
460     .clampScalar(0, S16, S256)
461     .widenScalarToNextPow2(0, 32)
462     .clampMaxNumElements(0, S32, 16)
463     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
464     .scalarize(0);
465 
466   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
467     // Full set of gfx9 features.
468     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
469       .legalFor({S32, S16, V2S16})
470       .clampScalar(0, S16, S32)
471       .clampMaxNumElements(0, S16, 2)
472       .scalarize(0)
473       .widenScalarToNextPow2(0, 32);
474 
475     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
476       .legalFor({S32, S16, V2S16}) // Clamp modifier
477       .minScalarOrElt(0, S16)
478       .clampMaxNumElements(0, S16, 2)
479       .scalarize(0)
480       .widenScalarToNextPow2(0, 32)
481       .lower();
482   } else if (ST.has16BitInsts()) {
483     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
484       .legalFor({S32, S16})
485       .clampScalar(0, S16, S32)
486       .scalarize(0)
487       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
488 
489     // Technically the saturating operations require clamp bit support, but this
490     // was introduced at the same time as 16-bit operations.
491     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
492       .legalFor({S32, S16}) // Clamp modifier
493       .minScalar(0, S16)
494       .scalarize(0)
495       .widenScalarToNextPow2(0, 16)
496       .lower();
497 
498     // We're just lowering this, but it helps get a better result to try to
499     // coerce to the desired type first.
500     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
501       .minScalar(0, S16)
502       .scalarize(0)
503       .lower();
504   } else {
505     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
506       .legalFor({S32})
507       .clampScalar(0, S32, S32)
508       .scalarize(0);
509 
510     if (ST.hasIntClamp()) {
511       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
512         .legalFor({S32}) // Clamp modifier.
513         .scalarize(0)
514         .minScalarOrElt(0, S32)
515         .lower();
516     } else {
517       // Clamp bit support was added in VI, along with 16-bit operations.
518       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
519         .minScalar(0, S32)
520         .scalarize(0)
521         .lower();
522     }
523 
524     // FIXME: DAG expansion gets better results. The widening uses the smaller
525     // range values and goes for the min/max lowering directly.
526     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
527       .minScalar(0, S32)
528       .scalarize(0)
529       .lower();
530   }
531 
532   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
533     .customFor({S32, S64})
534     .clampScalar(0, S32, S64)
535     .widenScalarToNextPow2(0, 32)
536     .scalarize(0);
537 
538   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
539     .legalFor({S32})
540     .clampScalar(0, S32, S32)
541     .scalarize(0);
542 
543   // Report legal for any types we can handle anywhere. For the cases only legal
544   // on the SALU, RegBankSelect will be able to re-legalize.
545   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
546     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
547     .clampScalar(0, S32, S64)
548     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
549     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
550     .widenScalarToNextPow2(0)
551     .scalarize(0);
552 
553   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
554                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
555     .legalFor({{S32, S1}, {S32, S32}})
556     .minScalar(0, S32)
557     // TODO: .scalarize(0)
558     .lower();
559 
560   getActionDefinitionsBuilder(G_BITCAST)
561     // Don't worry about the size constraint.
562     .legalIf(all(isRegisterType(0), isRegisterType(1)))
563     .lower();
564 
565 
566   getActionDefinitionsBuilder(G_CONSTANT)
567     .legalFor({S1, S32, S64, S16, GlobalPtr,
568                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
569     .legalIf(isPointer(0))
570     .clampScalar(0, S32, S64)
571     .widenScalarToNextPow2(0);
572 
573   getActionDefinitionsBuilder(G_FCONSTANT)
574     .legalFor({S32, S64, S16})
575     .clampScalar(0, S16, S64);
576 
577   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
578       .legalIf(isRegisterType(0))
579       // s1 and s16 are special cases because they have legal operations on
580       // them, but don't really occupy registers in the normal way.
581       .legalFor({S1, S16})
582       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
583       .clampScalarOrElt(0, S32, MaxScalar)
584       .widenScalarToNextPow2(0, 32)
585       .clampMaxNumElements(0, S32, 16);
586 
587   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
588 
589   // If the amount is divergent, we have to do a wave reduction to get the
590   // maximum value, so this is expanded during RegBankSelect.
591   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
592     .legalFor({{PrivatePtr, S32}});
593 
594   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
595     .customIf(typeIsNot(0, PrivatePtr));
596 
597   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
598 
599   auto &FPOpActions = getActionDefinitionsBuilder(
600     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
601     .legalFor({S32, S64});
602   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
603     .customFor({S32, S64});
604   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
605     .customFor({S32, S64});
606 
607   if (ST.has16BitInsts()) {
608     if (ST.hasVOP3PInsts())
609       FPOpActions.legalFor({S16, V2S16});
610     else
611       FPOpActions.legalFor({S16});
612 
613     TrigActions.customFor({S16});
614     FDIVActions.customFor({S16});
615   }
616 
617   auto &MinNumMaxNum = getActionDefinitionsBuilder({
618       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
619 
620   if (ST.hasVOP3PInsts()) {
621     MinNumMaxNum.customFor(FPTypesPK16)
622       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
623       .clampMaxNumElements(0, S16, 2)
624       .clampScalar(0, S16, S64)
625       .scalarize(0);
626   } else if (ST.has16BitInsts()) {
627     MinNumMaxNum.customFor(FPTypes16)
628       .clampScalar(0, S16, S64)
629       .scalarize(0);
630   } else {
631     MinNumMaxNum.customFor(FPTypesBase)
632       .clampScalar(0, S32, S64)
633       .scalarize(0);
634   }
635 
636   if (ST.hasVOP3PInsts())
637     FPOpActions.clampMaxNumElements(0, S16, 2);
638 
639   FPOpActions
640     .scalarize(0)
641     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
642 
643   TrigActions
644     .scalarize(0)
645     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
646 
647   FDIVActions
648     .scalarize(0)
649     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
650 
651   getActionDefinitionsBuilder({G_FNEG, G_FABS})
652     .legalFor(FPTypesPK16)
653     .clampMaxNumElements(0, S16, 2)
654     .scalarize(0)
655     .clampScalar(0, S16, S64);
656 
657   if (ST.has16BitInsts()) {
658     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
659       .legalFor({S32, S64, S16})
660       .scalarize(0)
661       .clampScalar(0, S16, S64);
662   } else {
663     getActionDefinitionsBuilder(G_FSQRT)
664       .legalFor({S32, S64})
665       .scalarize(0)
666       .clampScalar(0, S32, S64);
667 
668     if (ST.hasFractBug()) {
669       getActionDefinitionsBuilder(G_FFLOOR)
670         .customFor({S64})
671         .legalFor({S32, S64})
672         .scalarize(0)
673         .clampScalar(0, S32, S64);
674     } else {
675       getActionDefinitionsBuilder(G_FFLOOR)
676         .legalFor({S32, S64})
677         .scalarize(0)
678         .clampScalar(0, S32, S64);
679     }
680   }
681 
682   getActionDefinitionsBuilder(G_FPTRUNC)
683     .legalFor({{S32, S64}, {S16, S32}})
684     .scalarize(0)
685     .lower();
686 
687   getActionDefinitionsBuilder(G_FPEXT)
688     .legalFor({{S64, S32}, {S32, S16}})
689     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
690     .scalarize(0);
691 
692   getActionDefinitionsBuilder(G_FSUB)
693       // Use actual fsub instruction
694       .legalFor({S32})
695       // Must use fadd + fneg
696       .lowerFor({S64, S16, V2S16})
697       .scalarize(0)
698       .clampScalar(0, S32, S64);
699 
700   // Whether this is legal depends on the floating point mode for the function.
701   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
702   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
703     FMad.customFor({S32, S16});
704   else if (ST.hasMadMacF32Insts())
705     FMad.customFor({S32});
706   else if (ST.hasMadF16())
707     FMad.customFor({S16});
708   FMad.scalarize(0)
709       .lower();
710 
711   // TODO: Do we need to clamp maximum bitwidth?
712   getActionDefinitionsBuilder(G_TRUNC)
713     .legalIf(isScalar(0))
714     .legalFor({{V2S16, V2S32}})
715     .clampMaxNumElements(0, S16, 2)
716     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
717     // situations (like an invalid implicit use), we don't want to infinite loop
718     // in the legalizer.
719     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
720     .alwaysLegal();
721 
722   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
723     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
724                {S32, S1}, {S64, S1}, {S16, S1}})
725     .scalarize(0)
726     .clampScalar(0, S32, S64)
727     .widenScalarToNextPow2(1, 32);
728 
729   // TODO: Split s1->s64 during regbankselect for VALU.
730   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
731     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
732     .lowerFor({{S32, S64}})
733     .lowerIf(typeIs(1, S1))
734     .customFor({{S64, S64}});
735   if (ST.has16BitInsts())
736     IToFP.legalFor({{S16, S16}});
737   IToFP.clampScalar(1, S32, S64)
738        .minScalar(0, S32)
739        .scalarize(0)
740        .widenScalarToNextPow2(1);
741 
742   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
743     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
744     .customFor({{S64, S64}})
745     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
746   if (ST.has16BitInsts())
747     FPToI.legalFor({{S16, S16}});
748   else
749     FPToI.minScalar(1, S32);
750 
751   FPToI.minScalar(0, S32)
752        .scalarize(0)
753        .lower();
754 
755   // Lower roundeven into G_FRINT
756   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
757     .scalarize(0)
758     .lower();
759 
760   if (ST.has16BitInsts()) {
761     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
762       .legalFor({S16, S32, S64})
763       .clampScalar(0, S16, S64)
764       .scalarize(0);
765   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
766     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
767       .legalFor({S32, S64})
768       .clampScalar(0, S32, S64)
769       .scalarize(0);
770   } else {
771     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
772       .legalFor({S32})
773       .customFor({S64})
774       .clampScalar(0, S32, S64)
775       .scalarize(0);
776   }
777 
778   getActionDefinitionsBuilder(G_PTR_ADD)
779     .legalIf(all(isPointer(0), sameSize(0, 1)))
780     .scalarize(0)
781     .scalarSameSizeAs(1, 0);
782 
783   getActionDefinitionsBuilder(G_PTRMASK)
784     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
785     .scalarSameSizeAs(1, 0)
786     .scalarize(0);
787 
788   auto &CmpBuilder =
789     getActionDefinitionsBuilder(G_ICMP)
790     // The compare output type differs based on the register bank of the output,
791     // so make both s1 and s32 legal.
792     //
793     // Scalar compares producing output in scc will be promoted to s32, as that
794     // is the allocatable register type that will be needed for the copy from
795     // scc. This will be promoted during RegBankSelect, and we assume something
796     // before that won't try to use s32 result types.
797     //
798     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
799     // bank.
800     .legalForCartesianProduct(
801       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
802     .legalForCartesianProduct(
803       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
804   if (ST.has16BitInsts()) {
805     CmpBuilder.legalFor({{S1, S16}});
806   }
807 
808   CmpBuilder
809     .widenScalarToNextPow2(1)
810     .clampScalar(1, S32, S64)
811     .scalarize(0)
812     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
813 
814   getActionDefinitionsBuilder(G_FCMP)
815     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
816     .widenScalarToNextPow2(1)
817     .clampScalar(1, S32, S64)
818     .scalarize(0);
819 
820   // FIXME: fpow has a selection pattern that should move to custom lowering.
821   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
822   if (ST.has16BitInsts())
823     Exp2Ops.legalFor({S32, S16});
824   else
825     Exp2Ops.legalFor({S32});
826   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
827   Exp2Ops.scalarize(0);
828 
829   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
830   if (ST.has16BitInsts())
831     ExpOps.customFor({{S32}, {S16}});
832   else
833     ExpOps.customFor({S32});
834   ExpOps.clampScalar(0, MinScalarFPTy, S32)
835         .scalarize(0);
836 
837   getActionDefinitionsBuilder(G_FPOWI)
838     .clampScalar(0, MinScalarFPTy, S32)
839     .lower();
840 
841   // The 64-bit versions produce 32-bit results, but only on the SALU.
842   getActionDefinitionsBuilder(G_CTPOP)
843     .legalFor({{S32, S32}, {S32, S64}})
844     .clampScalar(0, S32, S32)
845     .clampScalar(1, S32, S64)
846     .scalarize(0)
847     .widenScalarToNextPow2(0, 32)
848     .widenScalarToNextPow2(1, 32);
849 
850   // The hardware instructions return a different result on 0 than the generic
851   // instructions expect. The hardware produces -1, but these produce the
852   // bitwidth.
853   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
854     .scalarize(0)
855     .clampScalar(0, S32, S32)
856     .clampScalar(1, S32, S64)
857     .widenScalarToNextPow2(0, 32)
858     .widenScalarToNextPow2(1, 32)
859     .lower();
860 
861   // The 64-bit versions produce 32-bit results, but only on the SALU.
862   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
863     .legalFor({{S32, S32}, {S32, S64}})
864     .clampScalar(0, S32, S32)
865     .clampScalar(1, S32, S64)
866     .scalarize(0)
867     .widenScalarToNextPow2(0, 32)
868     .widenScalarToNextPow2(1, 32);
869 
870   getActionDefinitionsBuilder(G_BITREVERSE)
871     .legalFor({S32})
872     .clampScalar(0, S32, S32)
873     .scalarize(0);
874 
875   if (ST.has16BitInsts()) {
876     getActionDefinitionsBuilder(G_BSWAP)
877       .legalFor({S16, S32, V2S16})
878       .clampMaxNumElements(0, S16, 2)
879       // FIXME: Fixing non-power-of-2 before clamp is a workaround for
880       // narrowScalar limitation.
881       .widenScalarToNextPow2(0)
882       .clampScalar(0, S16, S32)
883       .scalarize(0);
884 
885     if (ST.hasVOP3PInsts()) {
886       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
887         .legalFor({S32, S16, V2S16})
888         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
889         .clampMaxNumElements(0, S16, 2)
890         .minScalar(0, S16)
891         .widenScalarToNextPow2(0)
892         .scalarize(0)
893         .lower();
894     } else {
895       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
896         .legalFor({S32, S16})
897         .widenScalarToNextPow2(0)
898         .minScalar(0, S16)
899         .scalarize(0)
900         .lower();
901     }
902   } else {
903     // TODO: Should have same legality without v_perm_b32
904     getActionDefinitionsBuilder(G_BSWAP)
905       .legalFor({S32})
906       .lowerIf(scalarNarrowerThan(0, 32))
907       // FIXME: Fixing non-power-of-2 before clamp is a workaround for
908       // narrowScalar limitation.
909       .widenScalarToNextPow2(0)
910       .maxScalar(0, S32)
911       .scalarize(0)
912       .lower();
913 
914     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
915       .legalFor({S32})
916       .minScalar(0, S32)
917       .widenScalarToNextPow2(0)
918       .scalarize(0)
919       .lower();
920   }
921 
922   getActionDefinitionsBuilder(G_INTTOPTR)
923     // List the common cases
924     .legalForCartesianProduct(AddrSpaces64, {S64})
925     .legalForCartesianProduct(AddrSpaces32, {S32})
926     .scalarize(0)
927     // Accept any address space as long as the size matches
928     .legalIf(sameSize(0, 1))
929     .widenScalarIf(smallerThan(1, 0),
930       [](const LegalityQuery &Query) {
931         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
932       })
933     .narrowScalarIf(largerThan(1, 0),
934       [](const LegalityQuery &Query) {
935         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
936       });
937 
938   getActionDefinitionsBuilder(G_PTRTOINT)
939     // List the common cases
940     .legalForCartesianProduct(AddrSpaces64, {S64})
941     .legalForCartesianProduct(AddrSpaces32, {S32})
942     .scalarize(0)
943     // Accept any address space as long as the size matches
944     .legalIf(sameSize(0, 1))
945     .widenScalarIf(smallerThan(0, 1),
946       [](const LegalityQuery &Query) {
947         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
948       })
949     .narrowScalarIf(
950       largerThan(0, 1),
951       [](const LegalityQuery &Query) {
952         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
953       });
954 
955   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
956     .scalarize(0)
957     .custom();
958 
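  // Decide whether a memory access must be broken into smaller pieces: vector
  // extloads, accesses that exceed the maximum size for the address space, odd
  // register counts, and insufficient alignment all need to be split.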
959   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
960                                     bool IsLoad) -> bool {
961     const LLT DstTy = Query.Types[0];
962 
963     // Split vector extloads.
964     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
965     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
966 
967     if (MemSize < DstTy.getSizeInBits())
968       MemSize = std::max(MemSize, AlignBits);
969 
970     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
971       return true;
972 
973     const LLT PtrTy = Query.Types[1];
974     unsigned AS = PtrTy.getAddressSpace();
975     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
976       return true;
977 
978     // Catch weird sized loads that don't evenly divide into the access sizes
979     // TODO: May be able to widen depending on alignment etc.
980     unsigned NumRegs = (MemSize + 31) / 32;
981     if (NumRegs == 3) {
982       if (!ST.hasDwordx3LoadStores())
983         return true;
984     } else {
985       // If the alignment allows, these should have been widened.
986       if (!isPowerOf2_32(NumRegs))
987         return true;
988     }
989 
990     if (AlignBits < MemSize) {
991       const SITargetLowering *TLI = ST.getTargetLowering();
992       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
993                                                       Align(AlignBits / 8));
994     }
995 
996     return false;
997   };
998 
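  // Decide whether a load result with a non-power-of-2 size should instead be
  // widened to the next power of 2, given sufficient alignment and the address
  // space size limits.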
999   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
1000                                          unsigned Opc) -> bool {
1001     unsigned Size = Query.Types[0].getSizeInBits();
1002     if (isPowerOf2_32(Size))
1003       return false;
1004 
1005     if (Size == 96 && ST.hasDwordx3LoadStores())
1006       return false;
1007 
1008     unsigned AddrSpace = Query.Types[1].getAddressSpace();
1009     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
1010       return false;
1011 
1012     unsigned Align = Query.MMODescrs[0].AlignInBits;
1013     unsigned RoundedSize = NextPowerOf2(Size);
1014     return (Align >= RoundedSize);
1015   };
1016 
1017   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
1018   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
1019   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
1020 
1021   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1022   // LDS
1023   // TODO: Unsupported flat for SI.
1024 
1025   for (unsigned Op : {G_LOAD, G_STORE}) {
1026     const bool IsStore = Op == G_STORE;
1027 
1028     auto &Actions = getActionDefinitionsBuilder(Op);
1029     // Explicitly list some common cases.
1030     // TODO: Does this help compile time at all?
1031     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1032                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1033                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1034                                       {S64, GlobalPtr, 64, GlobalAlign32},
1035                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1036                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1037                                       {S32, GlobalPtr, 8, GlobalAlign8},
1038                                       {S32, GlobalPtr, 16, GlobalAlign16},
1039 
1040                                       {S32, LocalPtr, 32, 32},
1041                                       {S64, LocalPtr, 64, 32},
1042                                       {V2S32, LocalPtr, 64, 32},
1043                                       {S32, LocalPtr, 8, 8},
1044                                       {S32, LocalPtr, 16, 16},
1045                                       {V2S16, LocalPtr, 32, 32},
1046 
1047                                       {S32, PrivatePtr, 32, 32},
1048                                       {S32, PrivatePtr, 8, 8},
1049                                       {S32, PrivatePtr, 16, 16},
1050                                       {V2S16, PrivatePtr, 32, 32},
1051 
1052                                       {S32, ConstantPtr, 32, GlobalAlign32},
1053                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1054                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1055                                       {S64, ConstantPtr, 64, GlobalAlign32},
1056                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1057     Actions.legalIf(
1058       [=](const LegalityQuery &Query) -> bool {
1059         return isLoadStoreLegal(ST, Query, Op);
1060       });
1061 
1062     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1063     // 64-bits.
1064     //
1065     // TODO: Should generalize bitcast action into coerce, which will also cover
1066     // inserting addrspacecasts.
1067     Actions.customIf(typeIs(1, Constant32Ptr));
1068 
1069     // Turn any illegal element vectors into something easier to deal
1070     // with. These will ultimately produce 32-bit scalar shifts to extract the
1071     // parts anyway.
1072     //
1073     // For odd 16-bit element vectors, prefer to split those into pieces with
1074     // 16-bit vector parts.
1075     Actions.bitcastIf(
1076       [=](const LegalityQuery &Query) -> bool {
1077         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1078                                           Query.MMODescrs[0].SizeInBits);
1079       }, bitcastToRegisterType(0));
1080 
1081     Actions
1082         .customIf(typeIs(1, Constant32Ptr))
1083         // Widen suitably aligned loads by loading extra elements.
1084         .moreElementsIf([=](const LegalityQuery &Query) {
1085             const LLT Ty = Query.Types[0];
1086             return Op == G_LOAD && Ty.isVector() &&
1087                    shouldWidenLoadResult(Query, Op);
1088           }, moreElementsToNextPow2(0))
1089         .widenScalarIf([=](const LegalityQuery &Query) {
1090             const LLT Ty = Query.Types[0];
1091             return Op == G_LOAD && !Ty.isVector() &&
1092                    shouldWidenLoadResult(Query, Op);
1093           }, widenScalarOrEltToNextPow2(0))
1094         .narrowScalarIf(
1095             [=](const LegalityQuery &Query) -> bool {
1096               return !Query.Types[0].isVector() &&
1097                      needToSplitMemOp(Query, Op == G_LOAD);
1098             },
1099             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1100               const LLT DstTy = Query.Types[0];
1101               const LLT PtrTy = Query.Types[1];
1102 
1103               const unsigned DstSize = DstTy.getSizeInBits();
1104               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1105 
1106               // Split extloads.
1107               if (DstSize > MemSize)
1108                 return std::make_pair(0, LLT::scalar(MemSize));
1109 
1110               if (!isPowerOf2_32(DstSize)) {
1111                 // We're probably decomposing an odd sized store. Try to split
1112                 // to the widest type. TODO: Account for alignment. As-is it
1113                 // should be OK, since the new parts will be further legalized.
1114                 unsigned FloorSize = PowerOf2Floor(DstSize);
1115                 return std::make_pair(0, LLT::scalar(FloorSize));
1116               }
1117 
1118               if (DstSize > 32 && (DstSize % 32 != 0)) {
1119                 // FIXME: Need a way to specify non-extload of larger size if
1120                 // suitably aligned.
1121                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1122               }
1123 
1124               unsigned MaxSize = maxSizeForAddrSpace(ST,
1125                                                      PtrTy.getAddressSpace(),
1126                                                      Op == G_LOAD);
1127               if (MemSize > MaxSize)
1128                 return std::make_pair(0, LLT::scalar(MaxSize));
1129 
1130               unsigned Align = Query.MMODescrs[0].AlignInBits;
1131               return std::make_pair(0, LLT::scalar(Align));
1132             })
1133         .fewerElementsIf(
1134             [=](const LegalityQuery &Query) -> bool {
1135               return Query.Types[0].isVector() &&
1136                      needToSplitMemOp(Query, Op == G_LOAD);
1137             },
1138             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1139               const LLT DstTy = Query.Types[0];
1140               const LLT PtrTy = Query.Types[1];
1141 
1142               LLT EltTy = DstTy.getElementType();
1143               unsigned MaxSize = maxSizeForAddrSpace(ST,
1144                                                      PtrTy.getAddressSpace(),
1145                                                      Op == G_LOAD);
1146 
1147               // FIXME: Handle widened to power of 2 results better. This ends
1148               // up scalarizing.
1149               // FIXME: 3 element stores scalarized on SI
1150 
1151               // Split if it's too large for the address space.
1152               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1153                 unsigned NumElts = DstTy.getNumElements();
1154                 unsigned EltSize = EltTy.getSizeInBits();
1155 
1156                 if (MaxSize % EltSize == 0) {
1157                   return std::make_pair(
1158                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1159                 }
1160 
1161                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1162 
1163                 // FIXME: Refine when odd breakdowns handled
1164                 // The scalars will need to be re-legalized.
1165                 if (NumPieces == 1 || NumPieces >= NumElts ||
1166                     NumElts % NumPieces != 0)
1167                   return std::make_pair(0, EltTy);
1168 
1169                 return std::make_pair(0,
1170                                       LLT::vector(NumElts / NumPieces, EltTy));
1171               }
1172 
1173               // FIXME: We could probably handle weird extending loads better.
1174               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1175               if (DstTy.getSizeInBits() > MemSize)
1176                 return std::make_pair(0, EltTy);
1177 
1178               unsigned EltSize = EltTy.getSizeInBits();
1179               unsigned DstSize = DstTy.getSizeInBits();
1180               if (!isPowerOf2_32(DstSize)) {
1181                 // We're probably decomposing an odd sized store. Try to split
1182                 // to the widest type. TODO: Account for alignment. As-is it
1183                 // should be OK, since the new parts will be further legalized.
1184                 unsigned FloorSize = PowerOf2Floor(DstSize);
1185                 return std::make_pair(
1186                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1187               }
1188 
1189               // Need to split because of alignment.
1190               unsigned Align = Query.MMODescrs[0].AlignInBits;
1191               if (EltSize > Align &&
1192                   (EltSize / Align < DstTy.getNumElements())) {
1193                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1194               }
1195 
1196               // May need relegalization for the scalars.
1197               return std::make_pair(0, EltTy);
1198             })
1199         .minScalar(0, S32);
1200 
1201     if (IsStore)
1202       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1203 
1204     // TODO: Need a bitcast lower option?
1205     Actions
1206         .widenScalarToNextPow2(0)
1207         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1208   }
1209 
1210   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1211                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1212                                                   {S32, GlobalPtr, 16, 2 * 8},
1213                                                   {S32, LocalPtr, 8, 8},
1214                                                   {S32, LocalPtr, 16, 16},
1215                                                   {S32, PrivatePtr, 8, 8},
1216                                                   {S32, PrivatePtr, 16, 16},
1217                                                   {S32, ConstantPtr, 8, 8},
1218                                                   {S32, ConstantPtr, 16, 2 * 8}});
1219   if (ST.hasFlatAddressSpace()) {
1220     ExtLoads.legalForTypesWithMemDesc(
1221         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1222   }
1223 
1224   ExtLoads.clampScalar(0, S32, S32)
1225           .widenScalarToNextPow2(0)
1226           .unsupportedIfMemSizeNotPow2()
1227           .lower();
1228 
1229   auto &Atomics = getActionDefinitionsBuilder(
1230     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1231      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1232      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1233      G_ATOMICRMW_UMIN})
1234     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1235                {S64, GlobalPtr}, {S64, LocalPtr},
1236                {S32, RegionPtr}, {S64, RegionPtr}});
1237   if (ST.hasFlatAddressSpace()) {
1238     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1239   }
1240 
1241   if (ST.hasLDSFPAtomics()) {
1242     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1243       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1244   }
1245 
1246   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1247   // demarshalling
1248   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1249     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1250                 {S32, FlatPtr}, {S64, FlatPtr}})
1251     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1252                {S32, RegionPtr}, {S64, RegionPtr}});
1253   // TODO: Pointer types, any 32-bit or 64-bit vector
1254 
1255   // Condition should be s32 for scalar, s1 for vector.
1256   getActionDefinitionsBuilder(G_SELECT)
1257     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1258           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1259           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1260     .clampScalar(0, S16, S64)
1261     .scalarize(1)
1262     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1263     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1264     .clampMaxNumElements(0, S32, 2)
1265     .clampMaxNumElements(0, LocalPtr, 2)
1266     .clampMaxNumElements(0, PrivatePtr, 2)
1267     .scalarize(0)
1268     .widenScalarToNextPow2(0)
1269     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1270 
1271   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1272   // be more flexible with the shift amount type.
1273   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1274     .legalFor({{S32, S32}, {S64, S32}});
1275   if (ST.has16BitInsts()) {
1276     if (ST.hasVOP3PInsts()) {
1277       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1278             .clampMaxNumElements(0, S16, 2);
1279     } else
1280       Shifts.legalFor({{S16, S16}});
1281 
1282     // TODO: Support 16-bit shift amounts for all types
1283     Shifts.widenScalarIf(
1284       [=](const LegalityQuery &Query) {
1285         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1286         // 32-bit amount.
1287         const LLT ValTy = Query.Types[0];
1288         const LLT AmountTy = Query.Types[1];
1289         return ValTy.getSizeInBits() <= 16 &&
1290                AmountTy.getSizeInBits() < 16;
1291       }, changeTo(1, S16));
1292     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1293     Shifts.clampScalar(1, S32, S32);
1294     Shifts.clampScalar(0, S16, S64);
1295     Shifts.widenScalarToNextPow2(0, 16);
1296 
1297     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1298       .minScalar(0, S16)
1299       .scalarize(0)
1300       .lower();
1301   } else {
1302     // Make sure we legalize the shift amount type first, as the general
1303     // expansion for the shifted type will produce much worse code if it hasn't
1304     // been truncated already.
1305     Shifts.clampScalar(1, S32, S32);
1306     Shifts.clampScalar(0, S32, S64);
1307     Shifts.widenScalarToNextPow2(0, 32);
1308 
1309     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1310       .minScalar(0, S32)
1311       .scalarize(0)
1312       .lower();
1313   }
1314   Shifts.scalarize(0);
1315 
1316   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1317     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1318     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1319     unsigned IdxTypeIdx = 2;
1320 
1321     getActionDefinitionsBuilder(Op)
1322       .customIf([=](const LegalityQuery &Query) {
1323           const LLT EltTy = Query.Types[EltTypeIdx];
1324           const LLT VecTy = Query.Types[VecTypeIdx];
1325           const LLT IdxTy = Query.Types[IdxTypeIdx];
1326           const unsigned EltSize = EltTy.getSizeInBits();
1327           return (EltSize == 32 || EltSize == 64) &&
1328                   VecTy.getSizeInBits() % 32 == 0 &&
1329                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1330                   IdxTy.getSizeInBits() == 32;
1331         })
1332       .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
1333                  bitcastToVectorElement32(1))
1334       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1335       .bitcastIf(
1336         all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
1337         [=](const LegalityQuery &Query) {
1338           // For > 64-bit element types, try to turn this into a 64-bit
1339           // element vector since we may be able to do better indexing
1340           // if this is scalar. If not, fall back to 32.
1341           const LLT EltTy = Query.Types[EltTypeIdx];
1342           const LLT VecTy = Query.Types[VecTypeIdx];
1343           const unsigned DstEltSize = EltTy.getSizeInBits();
1344           const unsigned VecSize = VecTy.getSizeInBits();
1345 
1346           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1347           return std::make_pair(
1348             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1349         })
1350       .clampScalar(EltTypeIdx, S32, S64)
1351       .clampScalar(VecTypeIdx, S32, S64)
1352       .clampScalar(IdxTypeIdx, S32, S32)
1353       .clampMaxNumElements(1, S32, 32)
1354       // TODO: Clamp elements for 64-bit vectors?
1355       // It should only be necessary with variable indexes.
1356       // As a last resort, lower to the stack
1357       .lower();
1358   }
1359 
1360   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1361     .unsupportedIf([=](const LegalityQuery &Query) {
1362         const LLT &EltTy = Query.Types[1].getElementType();
1363         return Query.Types[0] != EltTy;
1364       });
1365 
1366   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1367     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1368     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1369 
1370     // FIXME: Doesn't handle extract of illegal sizes.
1371     getActionDefinitionsBuilder(Op)
1372       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1373       // FIXME: Multiples of 16 should not be legal.
1374       .legalIf([=](const LegalityQuery &Query) {
1375           const LLT BigTy = Query.Types[BigTyIdx];
1376           const LLT LitTy = Query.Types[LitTyIdx];
1377           return (BigTy.getSizeInBits() % 32 == 0) &&
1378                  (LitTy.getSizeInBits() % 16 == 0);
1379         })
1380       .widenScalarIf(
1381         [=](const LegalityQuery &Query) {
1382           const LLT BigTy = Query.Types[BigTyIdx];
1383           return (BigTy.getScalarSizeInBits() < 16);
1384         },
1385         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1386       .widenScalarIf(
1387         [=](const LegalityQuery &Query) {
1388           const LLT LitTy = Query.Types[LitTyIdx];
1389           return (LitTy.getScalarSizeInBits() < 16);
1390         },
1391         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1392       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1393       .widenScalarToNextPow2(BigTyIdx, 32);
1394 
1395   }
1396 
1397   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1398     .legalForCartesianProduct(AllS32Vectors, {S32})
1399     .legalForCartesianProduct(AllS64Vectors, {S64})
1400     .clampNumElements(0, V16S32, V32S32)
1401     .clampNumElements(0, V2S64, V16S64)
1402     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1403 
1404   if (ST.hasScalarPackInsts()) {
1405     BuildVector
1406       // FIXME: Should probably widen s1 vectors straight to s32
1407       .minScalarOrElt(0, S16)
1408       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1409       .minScalar(1, S32);
1410 
1411     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1412       .legalFor({V2S16, S32})
1413       .lower();
1414     BuildVector.minScalarOrElt(0, S32);
1415   } else {
1416     BuildVector.customFor({V2S16, S16});
1417     BuildVector.minScalarOrElt(0, S32);
1418 
1419     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1420       .customFor({V2S16, S32})
1421       .lower();
1422   }
1423 
1424   BuildVector.legalIf(isRegisterType(0));
1425 
1426   // FIXME: Clamp maximum size
1427   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1428     .legalIf(isRegisterType(0));
1429 
1430   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1431   // pre-legalize.
1432   if (ST.hasVOP3PInsts()) {
1433     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1434       .customFor({V2S16, V2S16})
1435       .lower();
1436   } else
1437     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1438 
1439   // Merge/Unmerge
1440   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1441     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1442     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1443 
1444     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1445       const LLT Ty = Query.Types[TypeIdx];
1446       if (Ty.isVector()) {
1447         const LLT &EltTy = Ty.getElementType();
1448         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1449           return true;
1450         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1451           return true;
1452       }
1453       return false;
1454     };
1455 
1456     auto &Builder = getActionDefinitionsBuilder(Op)
1457       .lowerFor({{S16, V2S16}})
1458       .lowerIf([=](const LegalityQuery &Query) {
1459           const LLT BigTy = Query.Types[BigTyIdx];
1460           return BigTy.getSizeInBits() == 32;
1461         })
1462       // Try to widen to s16 first for small types.
1463       // TODO: Only do this on targets with legal s16 shifts
1464       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1465       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1466       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1467       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1468                            elementTypeIs(1, S16)),
1469                        changeTo(1, V2S16))
1470       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1471       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1472       // valid.
1473       .clampScalar(LitTyIdx, S32, S512)
1474       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1475       // Break up vectors with weird elements into scalars
1476       .fewerElementsIf(
1477         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1478         scalarize(0))
1479       .fewerElementsIf(
1480         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1481         scalarize(1))
1482       .clampScalar(BigTyIdx, S32, MaxScalar);
1483 
1484     if (Op == G_MERGE_VALUES) {
1485       Builder.widenScalarIf(
1486         // TODO: Use 16-bit shifts if legal for 8-bit values?
1487         [=](const LegalityQuery &Query) {
1488           const LLT Ty = Query.Types[LitTyIdx];
1489           return Ty.getSizeInBits() < 32;
1490         },
1491         changeTo(LitTyIdx, S32));
1492     }
1493 
1494     Builder.widenScalarIf(
1495       [=](const LegalityQuery &Query) {
1496         const LLT Ty = Query.Types[BigTyIdx];
1497         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1498           Ty.getSizeInBits() % 16 != 0;
1499       },
1500       [=](const LegalityQuery &Query) {
1501         // Pick the next power of 2, or a multiple of 64 over 128, whichever
1502         // is smaller.
1503         const LLT &Ty = Query.Types[BigTyIdx];
1504         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1505         if (NewSizeInBits >= 256) {
1506           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1507           if (RoundedTo < NewSizeInBits)
1508             NewSizeInBits = RoundedTo;
1509         }
1510         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1511       })
1512       .legalIf([=](const LegalityQuery &Query) {
1513           const LLT &BigTy = Query.Types[BigTyIdx];
1514           const LLT &LitTy = Query.Types[LitTyIdx];
1515 
1516           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1517             return false;
1518           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1519             return false;
1520 
1521           return BigTy.getSizeInBits() % 16 == 0 &&
1522                  LitTy.getSizeInBits() % 16 == 0 &&
1523                  BigTy.getSizeInBits() <= MaxRegisterSize;
1524         })
1525       // Any vectors left are the wrong size. Scalarize them.
1526       .scalarize(0)
1527       .scalarize(1);
1528   }
1529 
1530   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1531   // RegBankSelect.
1532   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1533     .legalFor({{S32}, {S64}});
1534 
1535   if (ST.hasVOP3PInsts()) {
1536     SextInReg.lowerFor({{V2S16}})
1537       // Prefer to reduce the vector width for 16-bit vectors before lowering,
1538       // to expose more vector shift opportunities, since the lowering is
1539       // expanded using shifts.
1540       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1541   } else if (ST.has16BitInsts()) {
1542     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1543   } else {
1544     // Prefer to promote to s32 before lowering if we don't have 16-bit
1545     // shifts. This avoids a lot of intermediate truncate and extend operations.
1546     SextInReg.lowerFor({{S32}, {S64}});
1547   }
1548 
1549   SextInReg
1550     .scalarize(0)
1551     .clampScalar(0, S32, S64)
1552     .lower();
1553 
1554   getActionDefinitionsBuilder(G_FSHR)
1555     .legalFor({{S32, S32}})
1556     .scalarize(0)
1557     .lower();
1558 
1559   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1560     .legalFor({S64});
1561 
1562   getActionDefinitionsBuilder(G_FENCE)
1563     .alwaysLegal();
1564 
1565   getActionDefinitionsBuilder({
1566       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1567       G_FCOPYSIGN,
1568 
1569       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1570       G_ATOMICRMW_NAND,
1571       G_ATOMICRMW_FSUB,
1572       G_READ_REGISTER,
1573       G_WRITE_REGISTER,
1574 
1575       G_SADDO, G_SSUBO,
1576 
1577       // TODO: Implement
1578       G_FMINIMUM, G_FMAXIMUM,
1579       G_FSHL
1580     }).lower();
1581 
1582   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1583         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1584         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1585     .unsupported();
1586 
1587   computeTables();
1588   verify(*ST.getInstrInfo());
1589 }
1590 
1591 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1592                                          MachineInstr &MI) const {
1593   MachineIRBuilder &B = Helper.MIRBuilder;
1594   MachineRegisterInfo &MRI = *B.getMRI();
1595   GISelChangeObserver &Observer = Helper.Observer;
1596 
1597   switch (MI.getOpcode()) {
1598   case TargetOpcode::G_ADDRSPACE_CAST:
1599     return legalizeAddrSpaceCast(MI, MRI, B);
1600   case TargetOpcode::G_FRINT:
1601     return legalizeFrint(MI, MRI, B);
1602   case TargetOpcode::G_FCEIL:
1603     return legalizeFceil(MI, MRI, B);
1604   case TargetOpcode::G_INTRINSIC_TRUNC:
1605     return legalizeIntrinsicTrunc(MI, MRI, B);
1606   case TargetOpcode::G_SITOFP:
1607     return legalizeITOFP(MI, MRI, B, true);
1608   case TargetOpcode::G_UITOFP:
1609     return legalizeITOFP(MI, MRI, B, false);
1610   case TargetOpcode::G_FPTOSI:
1611     return legalizeFPTOI(MI, MRI, B, true);
1612   case TargetOpcode::G_FPTOUI:
1613     return legalizeFPTOI(MI, MRI, B, false);
1614   case TargetOpcode::G_FMINNUM:
1615   case TargetOpcode::G_FMAXNUM:
1616   case TargetOpcode::G_FMINNUM_IEEE:
1617   case TargetOpcode::G_FMAXNUM_IEEE:
1618     return legalizeMinNumMaxNum(Helper, MI);
1619   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1620     return legalizeExtractVectorElt(MI, MRI, B);
1621   case TargetOpcode::G_INSERT_VECTOR_ELT:
1622     return legalizeInsertVectorElt(MI, MRI, B);
1623   case TargetOpcode::G_SHUFFLE_VECTOR:
1624     return legalizeShuffleVector(MI, MRI, B);
1625   case TargetOpcode::G_FSIN:
1626   case TargetOpcode::G_FCOS:
1627     return legalizeSinCos(MI, MRI, B);
1628   case TargetOpcode::G_GLOBAL_VALUE:
1629     return legalizeGlobalValue(MI, MRI, B);
1630   case TargetOpcode::G_LOAD:
1631     return legalizeLoad(MI, MRI, B, Observer);
1632   case TargetOpcode::G_FMAD:
1633     return legalizeFMad(MI, MRI, B);
1634   case TargetOpcode::G_FDIV:
1635     return legalizeFDIV(MI, MRI, B);
1636   case TargetOpcode::G_UDIV:
1637   case TargetOpcode::G_UREM:
1638     return legalizeUDIV_UREM(MI, MRI, B);
1639   case TargetOpcode::G_SDIV:
1640   case TargetOpcode::G_SREM:
1641     return legalizeSDIV_SREM(MI, MRI, B);
1642   case TargetOpcode::G_ATOMIC_CMPXCHG:
1643     return legalizeAtomicCmpXChg(MI, MRI, B);
1644   case TargetOpcode::G_FLOG:
1645     return legalizeFlog(MI, B, numbers::ln2f);
1646   case TargetOpcode::G_FLOG10:
1647     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1648   case TargetOpcode::G_FEXP:
1649     return legalizeFExp(MI, B);
1650   case TargetOpcode::G_FPOW:
1651     return legalizeFPow(MI, B);
1652   case TargetOpcode::G_FFLOOR:
1653     return legalizeFFloor(MI, MRI, B);
1654   case TargetOpcode::G_BUILD_VECTOR:
1655     return legalizeBuildVector(MI, MRI, B);
1656   default:
1657     return false;
1658   }
1659 
1660   llvm_unreachable("expected switch to return");
1661 }
1662 
1663 Register AMDGPULegalizerInfo::getSegmentAperture(
1664   unsigned AS,
1665   MachineRegisterInfo &MRI,
1666   MachineIRBuilder &B) const {
1667   MachineFunction &MF = B.getMF();
1668   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1669   const LLT S32 = LLT::scalar(32);
1670 
1671   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1672 
1673   if (ST.hasApertureRegs()) {
1674     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1675     // getreg.
1676     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1677         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1678         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1679     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1680         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1681         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1682     unsigned Encoding =
1683         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1684         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1685         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1686 
1687     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1688 
1689     B.buildInstr(AMDGPU::S_GETREG_B32)
1690       .addDef(GetReg)
1691       .addImm(Encoding);
1692     MRI.setType(GetReg, S32);
1693 
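    // S_GETREG returns the 16-bit aperture field in the low bits of the
    // result; shifting it left by the field width (WidthM1 + 1) rebuilds the
    // 32-bit aperture value, which is the high half of the segment's flat
    // address.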
1694     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1695     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1696   }
1697 
1698   Register QueuePtr = MRI.createGenericVirtualRegister(
1699     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1700 
1701   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1702     return Register();
1703 
1704   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1705   // private_segment_aperture_base_hi.
1706   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1707 
1708   // TODO: can we be smarter about machine pointer info?
1709   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1710   MachineMemOperand *MMO = MF.getMachineMemOperand(
1711       PtrInfo,
1712       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1713           MachineMemOperand::MOInvariant,
1714       4, commonAlignment(Align(64), StructOffset));
1715 
1716   Register LoadAddr;
1717 
1718   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1719   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1720 }
1721 
1722 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1723   MachineInstr &MI, MachineRegisterInfo &MRI,
1724   MachineIRBuilder &B) const {
1725   MachineFunction &MF = B.getMF();
1726 
1727   const LLT S32 = LLT::scalar(32);
1728   Register Dst = MI.getOperand(0).getReg();
1729   Register Src = MI.getOperand(1).getReg();
1730 
1731   LLT DstTy = MRI.getType(Dst);
1732   LLT SrcTy = MRI.getType(Src);
1733   unsigned DestAS = DstTy.getAddressSpace();
1734   unsigned SrcAS = SrcTy.getAddressSpace();
1735 
1736   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1737   // vector element.
1738   assert(!DstTy.isVector());
1739 
1740   const AMDGPUTargetMachine &TM
1741     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1742 
1743   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1744     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1745     return true;
1746   }
1747 
1748   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1749     // Truncate.
1750     B.buildExtract(Dst, Src, 0);
1751     MI.eraseFromParent();
1752     return true;
1753   }
1754 
1755   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1756     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1757     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1758 
1759     // FIXME: This is a bit ugly: we build a merge of two pointers to form a
1760     // pointer in another address space. Merge operands are required to be the
1761     // same type, but creating an extra ptrtoint would be kind of pointless.
1762     auto HighAddr = B.buildConstant(
1763       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1764     B.buildMerge(Dst, {Src, HighAddr});
1765     MI.eraseFromParent();
1766     return true;
1767   }
1768 
1769   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1770     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1771            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1772     unsigned NullVal = TM.getNullPointerValue(DestAS);
1773 
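    // The flat null pointer must map to the segment's null value rather than
    // its truncated bit pattern, so compare against null and select instead of
    // doing a plain truncation.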
1774     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1775     auto FlatNull = B.buildConstant(SrcTy, 0);
1776 
1777     // Extract low 32-bits of the pointer.
1778     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1779 
1780     auto CmpRes =
1781         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1782     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1783 
1784     MI.eraseFromParent();
1785     return true;
1786   }
1787 
1788   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1789     return false;
1790 
1791   if (!ST.hasFlatAddressSpace())
1792     return false;
1793 
1794   auto SegmentNull =
1795       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1796   auto FlatNull =
1797       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1798 
1799   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1800   if (!ApertureReg.isValid())
1801     return false;
1802 
1803   auto CmpRes =
1804       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1805 
1806   // Coerce the type of the low half of the result so we can use merge_values.
1807   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1808 
1809   // TODO: Should we allow mismatched types but matching sizes in merges to
1810   // avoid the ptrtoint?
1811   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1812   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1813 
1814   MI.eraseFromParent();
1815   return true;
1816 }
1817 
1818 bool AMDGPULegalizerInfo::legalizeFrint(
1819   MachineInstr &MI, MachineRegisterInfo &MRI,
1820   MachineIRBuilder &B) const {
1821   Register Src = MI.getOperand(1).getReg();
1822   LLT Ty = MRI.getType(Src);
1823   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1824 
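  // Use the standard add/subtract of 2^52 trick: adding 2^52 with the sign of
  // the input forces rounding to an integer in the current rounding mode, and
  // subtracting it back recovers that integer. Values with a magnitude above
  // 0x1.fffffffffffffp+51 (2^52 - 0.5) are already integral and are passed
  // through unchanged by the final select.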
1825   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1826   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1827 
1828   auto C1 = B.buildFConstant(Ty, C1Val);
1829   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1830 
1831   // TODO: Should this propagate fast-math-flags?
1832   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1833   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1834 
1835   auto C2 = B.buildFConstant(Ty, C2Val);
1836   auto Fabs = B.buildFAbs(Ty, Src);
1837 
1838   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1839   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1840   MI.eraseFromParent();
1841   return true;
1842 }
1843 
1844 bool AMDGPULegalizerInfo::legalizeFceil(
1845   MachineInstr &MI, MachineRegisterInfo &MRI,
1846   MachineIRBuilder &B) const {
1847 
1848   const LLT S1 = LLT::scalar(1);
1849   const LLT S64 = LLT::scalar(64);
1850 
1851   Register Src = MI.getOperand(1).getReg();
1852   assert(MRI.getType(Src) == S64);
1853 
1854   // result = trunc(src)
1855   // if (src > 0.0 && src != result)
1856   //   result += 1.0
1857 
1858   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1859 
1860   const auto Zero = B.buildFConstant(S64, 0.0);
1861   const auto One = B.buildFConstant(S64, 1.0);
1862   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1863   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1864   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1865   auto Add = B.buildSelect(S64, And, One, Zero);
1866 
1867   // TODO: Should this propagate fast-math-flags?
1868   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1869   return true;
1870 }
1871 
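// Extract the unbiased exponent from the high 32 bits of an f64: bitfield
// extract the 11 exponent bits starting at bit 20 (52 - 32), then subtract the
// IEEE-754 exponent bias of 1023.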
1872 static MachineInstrBuilder extractF64Exponent(Register Hi,
1873                                               MachineIRBuilder &B) {
1874   const unsigned FractBits = 52;
1875   const unsigned ExpBits = 11;
1876   LLT S32 = LLT::scalar(32);
1877 
1878   auto Const0 = B.buildConstant(S32, FractBits - 32);
1879   auto Const1 = B.buildConstant(S32, ExpBits);
1880 
1881   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1882     .addUse(Hi)
1883     .addUse(Const0.getReg(0))
1884     .addUse(Const1.getReg(0));
1885 
1886   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1887 }
1888 
1889 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1890   MachineInstr &MI, MachineRegisterInfo &MRI,
1891   MachineIRBuilder &B) const {
1892   const LLT S1 = LLT::scalar(1);
1893   const LLT S32 = LLT::scalar(32);
1894   const LLT S64 = LLT::scalar(64);
1895 
1896   Register Src = MI.getOperand(1).getReg();
1897   assert(MRI.getType(Src) == S64);
1898 
1899   // TODO: Should this use extract since the low half is unused?
1900   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1901   Register Hi = Unmerge.getReg(1);
1902 
1903   // Extract the upper half, since this is where we will find the sign and
1904   // exponent.
1905   auto Exp = extractF64Exponent(Hi, B);
1906 
1907   const unsigned FractBits = 52;
1908 
1909   // Extract the sign bit.
1910   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1911   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1912 
1913   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1914 
1915   const auto Zero32 = B.buildConstant(S32, 0);
1916 
1917   // Extend back to 64-bits.
1918   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1919 
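  // For an unbiased exponent e in [0, 51], the low (52 - e) significand bits
  // are fractional. Arithmetically shifting the fraction mask right by e and
  // inverting it yields a mask that clears exactly those bits, truncating the
  // value toward zero.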
1920   auto Shr = B.buildAShr(S64, FractMask, Exp);
1921   auto Not = B.buildNot(S64, Shr);
1922   auto Tmp0 = B.buildAnd(S64, Src, Not);
1923   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1924 
1925   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1926   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1927 
1928   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1929   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1930   MI.eraseFromParent();
1931   return true;
1932 }
1933 
1934 bool AMDGPULegalizerInfo::legalizeITOFP(
1935   MachineInstr &MI, MachineRegisterInfo &MRI,
1936   MachineIRBuilder &B, bool Signed) const {
1937 
1938   Register Dst = MI.getOperand(0).getReg();
1939   Register Src = MI.getOperand(1).getReg();
1940 
1941   const LLT S64 = LLT::scalar(64);
1942   const LLT S32 = LLT::scalar(32);
1943 
1944   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1945 
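  // Convert the 64-bit integer in two 32-bit halves: convert the high half
  // (signed or unsigned as requested), scale it by 2^32 with ldexp, and then
  // add the unsigned conversion of the low half.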
1946   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1947 
1948   auto CvtHi = Signed ?
1949     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1950     B.buildUITOFP(S64, Unmerge.getReg(1));
1951 
1952   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1953 
1954   auto ThirtyTwo = B.buildConstant(S32, 32);
1955   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1956     .addUse(CvtHi.getReg(0))
1957     .addUse(ThirtyTwo.getReg(0));
1958 
1959   // TODO: Should this propagate fast-math-flags?
1960   B.buildFAdd(Dst, LdExp, CvtLo);
1961   MI.eraseFromParent();
1962   return true;
1963 }
1964 
1965 // TODO: Copied from DAG implementation. Verify logic and document how this
1966 // actually works.
1967 bool AMDGPULegalizerInfo::legalizeFPTOI(
1968   MachineInstr &MI, MachineRegisterInfo &MRI,
1969   MachineIRBuilder &B, bool Signed) const {
1970 
1971   Register Dst = MI.getOperand(0).getReg();
1972   Register Src = MI.getOperand(1).getReg();
1973 
1974   const LLT S64 = LLT::scalar(64);
1975   const LLT S32 = LLT::scalar(32);
1976 
1977   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1978 
1979   unsigned Flags = MI.getFlags();
1980 
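  // K0 is 2^-32 and K1 is -2^32. floor(trunc(x) * 2^-32) gives the high 32
  // bits of the result, and fma(hi, -2^32, trunc(x)) leaves the low 32 bits,
  // which always fit in an unsigned 32-bit conversion.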
1981   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1982   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1983   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1984 
1985   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1986   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1987   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1988 
1989   auto Hi = Signed ?
1990     B.buildFPTOSI(S32, FloorMul) :
1991     B.buildFPTOUI(S32, FloorMul);
1992   auto Lo = B.buildFPTOUI(S32, Fma);
1993 
1994   B.buildMerge(Dst, { Lo, Hi });
1995   MI.eraseFromParent();
1996 
1997   return true;
1998 }
1999 
2000 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2001                                                MachineInstr &MI) const {
2002   MachineFunction &MF = Helper.MIRBuilder.getMF();
2003   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2004 
2005   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2006                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2007 
2008   // With ieee_mode disabled, the instructions have the correct behavior
2009   // already for G_FMINNUM/G_FMAXNUM
2010   if (!MFI->getMode().IEEE)
2011     return !IsIEEEOp;
2012 
2013   if (IsIEEEOp)
2014     return true;
2015 
2016   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2017 }
2018 
2019 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2020   MachineInstr &MI, MachineRegisterInfo &MRI,
2021   MachineIRBuilder &B) const {
2022   // TODO: Should move some of this into LegalizerHelper.
2023 
2024   // TODO: Promote dynamic indexing of s16 to s32
2025 
2026   // FIXME: Artifact combiner probably should have replaced the truncated
2027   // constant before this, so we shouldn't need
2028   // getConstantVRegValWithLookThrough.
2029   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2030     MI.getOperand(2).getReg(), MRI);
2031   if (!IdxVal) // Dynamic case will be selected to register indexing.
2032     return true;
2033 
2034   Register Dst = MI.getOperand(0).getReg();
2035   Register Vec = MI.getOperand(1).getReg();
2036 
2037   LLT VecTy = MRI.getType(Vec);
2038   LLT EltTy = VecTy.getElementType();
2039   assert(EltTy == MRI.getType(Dst));
2040 
2041   if (IdxVal->Value < VecTy.getNumElements())
2042     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2043   else
2044     B.buildUndef(Dst);
2045 
2046   MI.eraseFromParent();
2047   return true;
2048 }
2049 
2050 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2051   MachineInstr &MI, MachineRegisterInfo &MRI,
2052   MachineIRBuilder &B) const {
2053   // TODO: Should move some of this into LegalizerHelper.
2054 
2055   // TODO: Promote dynamic indexing of s16 to s32
2056 
2057   // FIXME: Artifact combiner probably should have replaced the truncated
2058   // constant before this, so we shouldn't need
2059   // getConstantVRegValWithLookThrough.
2060   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2061     MI.getOperand(3).getReg(), MRI);
2062   if (!IdxVal) // Dynamic case will be selected to register indexing.
2063     return true;
2064 
2065   Register Dst = MI.getOperand(0).getReg();
2066   Register Vec = MI.getOperand(1).getReg();
2067   Register Ins = MI.getOperand(2).getReg();
2068 
2069   LLT VecTy = MRI.getType(Vec);
2070   LLT EltTy = VecTy.getElementType();
2071   assert(EltTy == MRI.getType(Ins));
2072 
2073   if (IdxVal->Value < VecTy.getNumElements())
2074     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2075   else
2076     B.buildUndef(Dst);
2077 
2078   MI.eraseFromParent();
2079   return true;
2080 }
2081 
2082 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2083   MachineInstr &MI, MachineRegisterInfo &MRI,
2084   MachineIRBuilder &B) const {
2085   const LLT V2S16 = LLT::vector(2, 16);
2086 
2087   Register Dst = MI.getOperand(0).getReg();
2088   Register Src0 = MI.getOperand(1).getReg();
2089   LLT DstTy = MRI.getType(Dst);
2090   LLT SrcTy = MRI.getType(Src0);
2091 
2092   if (SrcTy == V2S16 && DstTy == V2S16 &&
2093       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2094     return true;
2095 
2096   MachineIRBuilder HelperBuilder(MI);
2097   GISelObserverWrapper DummyObserver;
2098   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2099   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2100 }
2101 
2102 bool AMDGPULegalizerInfo::legalizeSinCos(
2103   MachineInstr &MI, MachineRegisterInfo &MRI,
2104   MachineIRBuilder &B) const {
2105 
2106   Register DstReg = MI.getOperand(0).getReg();
2107   Register SrcReg = MI.getOperand(1).getReg();
2108   LLT Ty = MRI.getType(DstReg);
2109   unsigned Flags = MI.getFlags();
2110 
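  // The hardware sin/cos take their input in revolutions rather than radians,
  // so scale by 1 / (2 * pi) first. Subtargets with a reduced valid input
  // range additionally need the operand wrapped into [0, 1) with fract.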
2111   Register TrigVal;
2112   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2113   if (ST.hasTrigReducedRange()) {
2114     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2115     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2116       .addUse(MulVal.getReg(0))
2117       .setMIFlags(Flags).getReg(0);
2118   } else
2119     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2120 
2121   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2122     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2123   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2124     .addUse(TrigVal)
2125     .setMIFlags(Flags);
2126   MI.eraseFromParent();
2127   return true;
2128 }
2129 
2130 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2131                                                   MachineIRBuilder &B,
2132                                                   const GlobalValue *GV,
2133                                                   int64_t Offset,
2134                                                   unsigned GAFlags) const {
2135   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2136   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2137   // to the following code sequence:
2138   //
2139   // For constant address space:
2140   //   s_getpc_b64 s[0:1]
2141   //   s_add_u32 s0, s0, $symbol
2142   //   s_addc_u32 s1, s1, 0
2143   //
2144   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2145   //   a fixup or relocation is emitted to replace $symbol with a literal
2146   //   constant, which is a pc-relative offset from the encoding of the $symbol
2147   //   operand to the global variable.
2148   //
2149   // For global address space:
2150   //   s_getpc_b64 s[0:1]
2151   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2152   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2153   //
2154   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2155   //   fixups or relocations are emitted to replace $symbol@*@lo and
2156   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2157   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2158   //   operand to the global variable.
2159   //
2160   // What we want here is an offset from the value returned by s_getpc
2161   // (which is the address of the s_add_u32 instruction) to the global
2162   // variable, but since the encoding of $symbol starts 4 bytes after the start
2163   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2164   // small. This requires us to add 4 to the global variable offset in order to
2165   // compute the correct address.
2166 
2167   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2168 
2169   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2170     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2171 
2172   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2173     .addDef(PCReg);
2174 
2175   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2176   if (GAFlags == SIInstrInfo::MO_NONE)
2177     MIB.addImm(0);
2178   else
2179     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2180 
2181   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2182 
2183   if (PtrTy.getSizeInBits() == 32)
2184     B.buildExtract(DstReg, PCReg, 0);
2185   return true;
2186 }
2187 
2188 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2189   MachineInstr &MI, MachineRegisterInfo &MRI,
2190   MachineIRBuilder &B) const {
2191   Register DstReg = MI.getOperand(0).getReg();
2192   LLT Ty = MRI.getType(DstReg);
2193   unsigned AS = Ty.getAddressSpace();
2194 
2195   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2196   MachineFunction &MF = B.getMF();
2197   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2198 
2199   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2200     if (!MFI->isEntryFunction()) {
2201       const Function &Fn = MF.getFunction();
2202       DiagnosticInfoUnsupported BadLDSDecl(
2203         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2204         DS_Warning);
2205       Fn.getContext().diagnose(BadLDSDecl);
2206 
2207       // We currently don't have a way to correctly allocate LDS objects that
2208       // aren't directly associated with a kernel. We do force inlining of
2209       // functions that use local objects. However, if these dead functions are
2210       // not eliminated, we don't want a compile time error. Just emit a warning
2211       // and a trap, since there should be no callable path here.
2212       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2213       B.buildUndef(DstReg);
2214       MI.eraseFromParent();
2215       return true;
2216     }
2217 
2218     // TODO: We could emit code to handle the initialization somewhere.
2219     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2220       const SITargetLowering *TLI = ST.getTargetLowering();
2221       if (!TLI->shouldUseLDSConstAddress(GV)) {
2222         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2223         return true; // Leave in place;
2224       }
2225 
2226       B.buildConstant(
2227           DstReg,
2228           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2229       MI.eraseFromParent();
2230       return true;
2231     }
2232 
2233     const Function &Fn = MF.getFunction();
2234     DiagnosticInfoUnsupported BadInit(
2235       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2236     Fn.getContext().diagnose(BadInit);
2237     return true;
2238   }
2239 
2240   const SITargetLowering *TLI = ST.getTargetLowering();
2241 
2242   if (TLI->shouldEmitFixup(GV)) {
2243     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2244     MI.eraseFromParent();
2245     return true;
2246   }
2247 
2248   if (TLI->shouldEmitPCReloc(GV)) {
2249     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2250     MI.eraseFromParent();
2251     return true;
2252   }
2253 
2254   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2255   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2256 
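  // Neither a fixup nor a direct pc-relative relocation applies, so go through
  // the GOT: compute the address of the GOT entry pc-relative, then load the
  // 64-bit absolute address of the global from it.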
2257   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2258       MachinePointerInfo::getGOT(MF),
2259       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2260           MachineMemOperand::MOInvariant,
2261       8 /*Size*/, Align(8));
2262 
2263   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2264 
2265   if (Ty.getSizeInBits() == 32) {
2266     // Truncate if this is a 32-bit constant address.
2267     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2268     B.buildExtract(DstReg, Load, 0);
2269   } else
2270     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2271 
2272   MI.eraseFromParent();
2273   return true;
2274 }
2275 
2276 bool AMDGPULegalizerInfo::legalizeLoad(
2277   MachineInstr &MI, MachineRegisterInfo &MRI,
2278   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2279   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2280   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2281   Observer.changingInstr(MI);
2282   MI.getOperand(1).setReg(Cast.getReg(0));
2283   Observer.changedInstr(MI);
2284   return true;
2285 }
2286 
2287 bool AMDGPULegalizerInfo::legalizeFMad(
2288   MachineInstr &MI, MachineRegisterInfo &MRI,
2289   MachineIRBuilder &B) const {
2290   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2291   assert(Ty.isScalar());
2292 
2293   MachineFunction &MF = B.getMF();
2294   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2295 
2296   // TODO: Always legal with future ftz flag.
2297   // FIXME: Do we need just output?
2298   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2299     return true;
2300   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2301     return true;
2302 
2303   MachineIRBuilder HelperBuilder(MI);
2304   GISelObserverWrapper DummyObserver;
2305   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2306   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2307 }
2308 
2309 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2310   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2311   Register DstReg = MI.getOperand(0).getReg();
2312   Register PtrReg = MI.getOperand(1).getReg();
2313   Register CmpVal = MI.getOperand(2).getReg();
2314   Register NewVal = MI.getOperand(3).getReg();
2315 
2316   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2317          "this should not have been custom lowered");
2318 
2319   LLT ValTy = MRI.getType(CmpVal);
2320   LLT VecTy = LLT::vector(2, ValTy);
2321 
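  // The target cmpswap pseudo takes its data as a single packed operand, with
  // the new value in the low element and the compare value in the high one.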
2322   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2323 
2324   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2325     .addDef(DstReg)
2326     .addUse(PtrReg)
2327     .addUse(PackedVal)
2328     .setMemRefs(MI.memoperands());
2329 
2330   MI.eraseFromParent();
2331   return true;
2332 }
2333 
2334 bool AMDGPULegalizerInfo::legalizeFlog(
2335   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2336   Register Dst = MI.getOperand(0).getReg();
2337   Register Src = MI.getOperand(1).getReg();
2338   LLT Ty = B.getMRI()->getType(Dst);
2339   unsigned Flags = MI.getFlags();
2340 
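  // log_b(x) = log2(x) * (1 / log2(b)); the callers pass ln(2) for the natural
  // log and ln(2) / ln(10) for log10 as Log2BaseInverted.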
2341   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2342   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2343 
2344   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2345   MI.eraseFromParent();
2346   return true;
2347 }
2348 
2349 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2350                                        MachineIRBuilder &B) const {
2351   Register Dst = MI.getOperand(0).getReg();
2352   Register Src = MI.getOperand(1).getReg();
2353   unsigned Flags = MI.getFlags();
2354   LLT Ty = B.getMRI()->getType(Dst);
2355 
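  // exp(x) = exp2(x * log2(e))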
2356   auto K = B.buildFConstant(Ty, numbers::log2e);
2357   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2358   B.buildFExp2(Dst, Mul, Flags);
2359   MI.eraseFromParent();
2360   return true;
2361 }
2362 
2363 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2364                                        MachineIRBuilder &B) const {
2365   Register Dst = MI.getOperand(0).getReg();
2366   Register Src0 = MI.getOperand(1).getReg();
2367   Register Src1 = MI.getOperand(2).getReg();
2368   unsigned Flags = MI.getFlags();
2369   LLT Ty = B.getMRI()->getType(Dst);
2370   const LLT S16 = LLT::scalar(16);
2371   const LLT S32 = LLT::scalar(32);
2372 
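  // Expand as pow(x, y) = exp2(y * log2(x)). The legacy multiply is used for
  // the product since it treats 0 * anything as 0, which gives the expected
  // result when log2(x) is infinite (e.g. pow(0, 0) == 1).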
2373   if (Ty == S32) {
2374     auto Log = B.buildFLog2(S32, Src0, Flags);
2375     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2376       .addUse(Log.getReg(0))
2377       .addUse(Src1)
2378       .setMIFlags(Flags);
2379     B.buildFExp2(Dst, Mul, Flags);
2380   } else if (Ty == S16) {
2381     // There's no f16 fmul_legacy, so we need to convert for it.
2382     auto Log = B.buildFLog2(S16, Src0, Flags);
2383     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2384     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2385     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2386       .addUse(Ext0.getReg(0))
2387       .addUse(Ext1.getReg(0))
2388       .setMIFlags(Flags);
2389 
2390     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2391   } else
2392     return false;
2393 
2394   MI.eraseFromParent();
2395   return true;
2396 }
2397 
2398 // Find a source register, ignoring any possible source modifiers.
2399 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2400   Register ModSrc = OrigSrc;
2401   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2402     ModSrc = SrcFNeg->getOperand(1).getReg();
2403     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2404       ModSrc = SrcFAbs->getOperand(1).getReg();
2405   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2406     ModSrc = SrcFAbs->getOperand(1).getReg();
2407   return ModSrc;
2408 }
2409 
2410 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2411                                          MachineRegisterInfo &MRI,
2412                                          MachineIRBuilder &B) const {
2413 
2414   const LLT S1 = LLT::scalar(1);
2415   const LLT S64 = LLT::scalar(64);
2416   Register Dst = MI.getOperand(0).getReg();
2417   Register OrigSrc = MI.getOperand(1).getReg();
2418   unsigned Flags = MI.getFlags();
2419   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2420          "this should not have been custom lowered");
2421 
2422   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2423   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2424   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2425   // V_FRACT bug is:
2426   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2427   //
2428   // Convert floor(x) to (x - fract(x))
2429 
2430   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2431     .addUse(OrigSrc)
2432     .setMIFlags(Flags);
2433 
2434   // Give source modifier matching some assistance before obscuring a foldable
2435   // pattern.
2436 
2437   // TODO: We can avoid the neg on the fract? The input sign to fract
2438   // shouldn't matter?
2439   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2440 
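  // 0x3fefffffffffffff is the largest double that is strictly less than 1.0,
  // i.e. the clamp value from the workaround formula above.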
2441   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2442 
2443   Register Min = MRI.createGenericVirtualRegister(S64);
2444 
2445   // We don't need to concern ourselves with the snan handling difference, so
2446   // use the one which will directly select.
2447   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2448   if (MFI->getMode().IEEE)
2449     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2450   else
2451     B.buildFMinNum(Min, Fract, Const, Flags);
2452 
2453   Register CorrectedFract = Min;
2454   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2455     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2456     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2457   }
2458 
2459   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2460   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2461 
2462   MI.eraseFromParent();
2463   return true;
2464 }
2465 
2466 // Turn an illegal packed v2s16 build vector into bit operations.
2467 // TODO: This should probably be a bitcast action in LegalizerHelper.
2468 bool AMDGPULegalizerInfo::legalizeBuildVector(
2469   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2470   Register Dst = MI.getOperand(0).getReg();
2471   const LLT S32 = LLT::scalar(32);
2472   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2473 
2474   Register Src0 = MI.getOperand(1).getReg();
2475   Register Src1 = MI.getOperand(2).getReg();
2476   assert(MRI.getType(Src0) == LLT::scalar(16));
2477 
2478   auto Merge = B.buildMerge(S32, {Src0, Src1});
2479   B.buildBitcast(Dst, Merge);
2480 
2481   MI.eraseFromParent();
2482   return true;
2483 }
2484 
2485 // Return the use branch instruction, otherwise null if the usage is invalid.
2486 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2487                                        MachineRegisterInfo &MRI,
2488                                        MachineInstr *&Br,
2489                                        MachineBasicBlock *&UncondBrTarget) {
2490   Register CondDef = MI.getOperand(0).getReg();
2491   if (!MRI.hasOneNonDBGUse(CondDef))
2492     return nullptr;
2493 
2494   MachineBasicBlock *Parent = MI.getParent();
2495   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2496   if (UseMI.getParent() != Parent ||
2497       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2498     return nullptr;
2499 
2500   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2501   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2502   if (Next == Parent->end()) {
2503     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2504     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2505       return nullptr;
2506     UncondBrTarget = &*NextMBB;
2507   } else {
2508     if (Next->getOpcode() != AMDGPU::G_BR)
2509       return nullptr;
2510     Br = &*Next;
2511     UncondBrTarget = Br->getOperand(0).getMBB();
2512   }
2513 
2514   return &UseMI;
2515 }
2516 
2517 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2518                                          const ArgDescriptor *Arg,
2519                                          const TargetRegisterClass *ArgRC,
2520                                          LLT ArgTy) const {
2521   MCRegister SrcReg = Arg->getRegister();
2522   assert(SrcReg.isPhysical() && "Physical register expected");
2523   assert(DstReg.isVirtual() && "Virtual register expected");
2524 
2525   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2526                                              ArgTy);
2527   if (Arg->isMasked()) {
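    // Some inputs (for example the packed workitem IDs) share a single
    // physical register; shift the field down and mask it to extract just the
    // requested value.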
2528     // TODO: Should we try to emit this once in the entry block?
2529     const LLT S32 = LLT::scalar(32);
2530     const unsigned Mask = Arg->getMask();
2531     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2532 
2533     Register AndMaskSrc = LiveIn;
2534 
2535     if (Shift != 0) {
2536       auto ShiftAmt = B.buildConstant(S32, Shift);
2537       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2538     }
2539 
2540     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2541   } else {
2542     B.buildCopy(DstReg, LiveIn);
2543   }
2544 
2545   return true;
2546 }
2547 
2548 bool AMDGPULegalizerInfo::loadInputValue(
2549     Register DstReg, MachineIRBuilder &B,
2550     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2551   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2552   const ArgDescriptor *Arg;
2553   const TargetRegisterClass *ArgRC;
2554   LLT ArgTy;
2555   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2556 
2557   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2558     return false; // TODO: Handle these
2559   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2560 }
2561 
2562 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2563     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2564     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2565   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2566     return false;
2567 
2568   MI.eraseFromParent();
2569   return true;
2570 }
2571 
2572 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2573                                        MachineRegisterInfo &MRI,
2574                                        MachineIRBuilder &B) const {
2575   Register Dst = MI.getOperand(0).getReg();
2576   LLT DstTy = MRI.getType(Dst);
2577   LLT S16 = LLT::scalar(16);
2578   LLT S32 = LLT::scalar(32);
2579   LLT S64 = LLT::scalar(64);
2580 
2581   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2582     return true;
2583 
2584   if (DstTy == S16)
2585     return legalizeFDIV16(MI, MRI, B);
2586   if (DstTy == S32)
2587     return legalizeFDIV32(MI, MRI, B);
2588   if (DstTy == S64)
2589     return legalizeFDIV64(MI, MRI, B);
2590 
2591   return false;
2592 }
2593 
2594 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2595                                                   Register DstReg,
2596                                                   Register X,
2597                                                   Register Y,
2598                                                   bool IsDiv) const {
2599   const LLT S1 = LLT::scalar(1);
2600   const LLT S32 = LLT::scalar(32);
2601 
2602   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2603   // algorithm used here.
2604 
2605   // Initial estimate of inv(y).
2606   auto FloatY = B.buildUITOFP(S32, Y);
2607   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
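  // Scale by a float just below 2^32 (0x4f7ffffe) so the estimate becomes a
  // 32-bit fixed-point approximation of 2^32 / y that stays representable as
  // an unsigned 32-bit value after the conversion back to an integer.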
2608   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2609   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2610   auto Z = B.buildFPTOUI(S32, ScaledY);
2611 
2612   // One round of UNR (unsigned Newton-Raphson) refinement.
2613   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2614   auto NegYZ = B.buildMul(S32, NegY, Z);
2615   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2616 
2617   // Quotient/remainder estimate.
2618   auto Q = B.buildUMulH(S32, X, Z);
2619   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2620 
2621   // First quotient/remainder refinement.
2622   auto One = B.buildConstant(S32, 1);
2623   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2624   if (IsDiv)
2625     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2626   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2627 
2628   // Second quotient/remainder refinement.
2629   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2630   if (IsDiv)
2631     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2632   else
2633     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2634 }
2635 
2636 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2637                                               MachineRegisterInfo &MRI,
2638                                               MachineIRBuilder &B) const {
2639   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2640   Register DstReg = MI.getOperand(0).getReg();
2641   Register Num = MI.getOperand(1).getReg();
2642   Register Den = MI.getOperand(2).getReg();
2643   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2644   MI.eraseFromParent();
2645   return true;
2646 }
2647 
2648 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2649 //
2650 // Return lo, hi of result
2651 //
2652 // %cvt.lo = G_UITOFP Val.lo
2653 // %cvt.hi = G_UITOFP Val.hi
2654 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2655 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2656 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2657 // %mul2 = G_FMUL %mul1, 2**(-32)
2658 // %trunc = G_INTRINSIC_TRUNC %mul2
2659 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2660 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2661 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2662                                                        Register Val) {
2663   const LLT S32 = LLT::scalar(32);
2664   auto Unmerge = B.buildUnmerge(S32, Val);
2665 
2666   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2667   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2668 
2669   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2670                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2671 
2672   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2673   auto Mul1 =
2674       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2675 
2676   // 2**(-32)
2677   auto Mul2 =
2678       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2679   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2680 
2681   // -(2**32)
2682   auto Mad2 = B.buildFMAD(S32, Trunc,
2683                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2684 
2685   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2686   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2687 
2688   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2689 }
2690 
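// 64-bit unsigned division/remainder. This follows the same reciprocal-based
// scheme as the 32-bit path: start from the ~2^64 / denominator estimate
// produced by emitReciprocalU64, refine it with two Newton-Raphson style steps
// carried out on 32-bit halves with explicit carry chains, then apply up to
// two conditional corrections to the quotient and remainder.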
2691 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2692                                                   Register DstReg,
2693                                                   Register Numer,
2694                                                   Register Denom,
2695                                                   bool IsDiv) const {
2696   const LLT S32 = LLT::scalar(32);
2697   const LLT S64 = LLT::scalar(64);
2698   const LLT S1 = LLT::scalar(1);
2699   Register RcpLo, RcpHi;
2700 
2701   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2702 
2703   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2704 
2705   auto Zero64 = B.buildConstant(S64, 0);
2706   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2707 
2708   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2709   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2710 
2711   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2712   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2713   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2714 
2715   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2716   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2717   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2718   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2719 
2720   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2721   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2722   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2723   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2724   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2725 
2726   auto Zero32 = B.buildConstant(S32, 0);
2727   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2728   auto Add2_HiC =
2729       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2730   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2731   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2732 
2733   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2734   Register NumerLo = UnmergeNumer.getReg(0);
2735   Register NumerHi = UnmergeNumer.getReg(1);
2736 
2737   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2738   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2739   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2740   Register Mul3_Lo = UnmergeMul3.getReg(0);
2741   Register Mul3_Hi = UnmergeMul3.getReg(1);
2742   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2743   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2744   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2745   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2746 
2747   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2748   Register DenomLo = UnmergeDenom.getReg(0);
2749   Register DenomHi = UnmergeDenom.getReg(1);
2750 
2751   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2752   auto C1 = B.buildSExt(S32, CmpHi);
2753 
2754   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2755   auto C2 = B.buildSExt(S32, CmpLo);
2756 
2757   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2758   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2759 
2760   // TODO: Here and below, portions of the code could be enclosed in if/endif.
2761   // Currently control flow is unconditional and we emit four selects after the
2762   // potential endif in place of PHIs.
2763 
2764   // if C3 != 0 ...
2765   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2766   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2767   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2768   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2769 
2770   auto One64 = B.buildConstant(S64, 1);
2771   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2772 
2773   auto C4 =
2774       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2775   auto C5 =
2776       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2777   auto C6 = B.buildSelect(
2778       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2779 
2780   // if (C6 != 0)
2781   auto Add4 = B.buildAdd(S64, Add3, One64);
2782   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2783 
2784   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2785   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2786   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2787 
2788   // endif C6
2789   // endif C3
2790 
2791   if (IsDiv) {
2792     auto Sel1 = B.buildSelect(
2793         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2794     B.buildSelect(DstReg,
2795                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2796   } else {
2797     auto Sel2 = B.buildSelect(
2798         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2799     B.buildSelect(DstReg,
2800                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2801   }
2802 }
2803 
2804 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2805                                             MachineRegisterInfo &MRI,
2806                                             MachineIRBuilder &B) const {
2807   const LLT S64 = LLT::scalar(64);
2808   const LLT S32 = LLT::scalar(32);
2809   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2810   Register DstReg = MI.getOperand(0).getReg();
2811   Register Num = MI.getOperand(1).getReg();
2812   Register Den = MI.getOperand(2).getReg();
2813   LLT Ty = MRI.getType(DstReg);
2814 
2815   if (Ty == S32)
2816     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2817   else if (Ty == S64)
2818     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2819   else
2820     return false;
2821 
2822   MI.eraseFromParent();
2823   return true;
2825 }
2826 
2827 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2828                                             MachineRegisterInfo &MRI,
2829                                             MachineIRBuilder &B) const {
2830   const LLT S64 = LLT::scalar(64);
2831   const LLT S32 = LLT::scalar(32);
2832 
2833   Register DstReg = MI.getOperand(0).getReg();
2834   const LLT Ty = MRI.getType(DstReg);
2835   if (Ty != S32 && Ty != S64)
2836     return false;
2837 
2838   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2839 
2840   Register LHS = MI.getOperand(1).getReg();
2841   Register RHS = MI.getOperand(2).getReg();
2842 
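  // Reduce signed division to the unsigned case. With the sign mask
  // s = x >> (bits - 1), (x + s) ^ s computes |x|. The quotient is negated
  // when the operand signs differ, and the remainder takes the sign of the
  // dividend; the negation is applied the same way, as (v ^ sign) - sign.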
2843   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2844   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2845   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2846 
2847   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2848   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2849 
2850   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2851   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2852 
2853   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2854   if (Ty == S32)
2855     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2856   else
2857     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2858 
2859   Register Sign;
2860   if (IsDiv)
2861     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2862   else
2863     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2864 
2865   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2866   B.buildSub(DstReg, UDivRem, Sign);
2867 
2868   MI.eraseFromParent();
2869   return true;
2870 }
2871 
2872 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2873                                                  MachineRegisterInfo &MRI,
2874                                                  MachineIRBuilder &B) const {
2875   Register Res = MI.getOperand(0).getReg();
2876   Register LHS = MI.getOperand(1).getReg();
2877   Register RHS = MI.getOperand(2).getReg();
2878 
2879   uint16_t Flags = MI.getFlags();
2880 
2881   LLT ResTy = MRI.getType(Res);
2882   LLT S32 = LLT::scalar(32);
2883   LLT S64 = LLT::scalar(64);
2884 
2885   const MachineFunction &MF = B.getMF();
2886   bool Unsafe =
2887     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2888 
2889   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2890     return false;
2891 
2892   if (!Unsafe && ResTy == S32 &&
2893       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2894     return false;
2895 
2896   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2897     // 1 / x -> RCP(x)
2898     if (CLHS->isExactlyValue(1.0)) {
2899       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2900         .addUse(RHS)
2901         .setMIFlags(Flags);
2902 
2903       MI.eraseFromParent();
2904       return true;
2905     }
2906 
2907     // -1 / x -> RCP( FNEG(x) )
2908     if (CLHS->isExactlyValue(-1.0)) {
2909       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2910       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2911         .addUse(FNeg.getReg(0))
2912         .setMIFlags(Flags);
2913 
2914       MI.eraseFromParent();
2915       return true;
2916     }
2917   }
2918 
2919   // x / y -> x * (1.0 / y)
2920   if (Unsafe) {
2921     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2922       .addUse(RHS)
2923       .setMIFlags(Flags);
2924     B.buildFMul(Res, LHS, RCP, Flags);
2925 
2926     MI.eraseFromParent();
2927     return true;
2928   }
2929 
2930   return false;
2931 }
2932 
2933 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2934                                          MachineRegisterInfo &MRI,
2935                                          MachineIRBuilder &B) const {
2936   Register Res = MI.getOperand(0).getReg();
2937   Register LHS = MI.getOperand(1).getReg();
2938   Register RHS = MI.getOperand(2).getReg();
2939 
2940   uint16_t Flags = MI.getFlags();
2941 
2942   LLT S16 = LLT::scalar(16);
2943   LLT S32 = LLT::scalar(32);
2944 
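  // f16 fdiv is lowered by promoting to f32: take a fast f32 reciprocal of the
  // denominator, multiply by the numerator, truncate back to f16, and let the
  // div_fixup intrinsic handle special cases and the final result selection.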
2945   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2946   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2947 
2948   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2949     .addUse(RHSExt.getReg(0))
2950     .setMIFlags(Flags);
2951 
2952   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2953   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2954 
2955   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2956     .addUse(RDst.getReg(0))
2957     .addUse(RHS)
2958     .addUse(LHS)
2959     .setMIFlags(Flags);
2960 
2961   MI.eraseFromParent();
2962   return true;
2963 }
2964 
2965 // Enable FP32 denormal handling when 'Enable' is true; otherwise restore the
2966 // function's default FP32 denorm mode from SIModeRegisterDefaults.
2967 static void toggleSPDenormMode(bool Enable,
2968                                MachineIRBuilder &B,
2969                                const GCNSubtarget &ST,
2970                                AMDGPU::SIModeRegisterDefaults Mode) {
2971   // Set SP denorm mode to this value.
2972   unsigned SPDenormMode =
2973     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2974 
2975   if (ST.hasDenormModeInst()) {
2976     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2977     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2978 
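    // The S_DENORM_MODE immediate packs the FP32 denorm setting in bits [1:0]
    // and the FP64/FP16 setting in bits [3:2], hence the shift by 2 below.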
2979     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2980     B.buildInstr(AMDGPU::S_DENORM_MODE)
2981       .addImm(NewDenormModeValue);
2982 
2983   } else {
2984     // Select FP32 bit field in mode register.
2985     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2986                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2987                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2988 
2989     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2990       .addImm(SPDenormMode)
2991       .addImm(SPDenormModeBitField);
2992   }
2993 }
2994 
2995 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2996                                          MachineRegisterInfo &MRI,
2997                                          MachineIRBuilder &B) const {
2998   Register Res = MI.getOperand(0).getReg();
2999   Register LHS = MI.getOperand(1).getReg();
3000   Register RHS = MI.getOperand(2).getReg();
3001   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3002   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3003 
3004   uint16_t Flags = MI.getFlags();
3005 
3006   LLT S32 = LLT::scalar(32);
3007   LLT S1 = LLT::scalar(1);
3008 
3009   auto One = B.buildFConstant(S32, 1.0f);
3010 
3011   auto DenominatorScaled =
3012     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3013       .addUse(LHS)
3014       .addUse(RHS)
3015       .addImm(0)
3016       .setMIFlags(Flags);
3017   auto NumeratorScaled =
3018     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3019       .addUse(LHS)
3020       .addUse(RHS)
3021       .addImm(1)
3022       .setMIFlags(Flags);
3023 
3024   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3025     .addUse(DenominatorScaled.getReg(0))
3026     .setMIFlags(Flags);
3027   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3028 
3029   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3030   // aren't modeled as reading it.
3031   if (!Mode.allFP32Denormals())
3032     toggleSPDenormMode(true, B, ST, Mode);
3033 
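  // Refine the reciprocal estimate and correct the quotient with FMA residuals:
  //   Fma0 = 1 - DenominatorScaled * ApproxRcp            (reciprocal error)
  //   Fma1 = ApproxRcp + ApproxRcp * Fma0                 (refined reciprocal)
  //   Mul  = NumeratorScaled * Fma1                       (initial quotient)
  //   Fma2 = NumeratorScaled - DenominatorScaled * Mul    (quotient residual)
  //   Fma3 = Mul + Fma2 * Fma1                            (refined quotient)
  //   Fma4 = NumeratorScaled - DenominatorScaled * Fma3   (residual for div_fmas)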
3034   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3035   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3036   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3037   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3038   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3039   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3040 
3041   if (!Mode.allFP32Denormals())
3042     toggleSPDenormMode(false, B, ST, Mode);
3043 
3044   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3045     .addUse(Fma4.getReg(0))
3046     .addUse(Fma1.getReg(0))
3047     .addUse(Fma3.getReg(0))
3048     .addUse(NumeratorScaled.getReg(1))
3049     .setMIFlags(Flags);
3050 
3051   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3052     .addUse(Fmas.getReg(0))
3053     .addUse(RHS)
3054     .addUse(LHS)
3055     .setMIFlags(Flags);
3056 
3057   MI.eraseFromParent();
3058   return true;
3059 }
3060 
3061 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3062                                          MachineRegisterInfo &MRI,
3063                                          MachineIRBuilder &B) const {
3064   Register Res = MI.getOperand(0).getReg();
3065   Register LHS = MI.getOperand(1).getReg();
3066   Register RHS = MI.getOperand(2).getReg();
3067 
3068   uint16_t Flags = MI.getFlags();
3069 
3070   LLT S64 = LLT::scalar(64);
3071   LLT S1 = LLT::scalar(1);
3072 
3073   auto One = B.buildFConstant(S64, 1.0);
3074 
3075   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3076     .addUse(LHS)
3077     .addUse(RHS)
3078     .addImm(0)
3079     .setMIFlags(Flags);
3080 
3081   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3082 
3083   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3084     .addUse(DivScale0.getReg(0))
3085     .setMIFlags(Flags);
3086 
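  // Same structure as the f32 path: two Newton-Raphson refinement steps on the
  // reciprocal estimate (Fma0/Fma1 and Fma2/Fma3), then an initial quotient
  // (Mul) and its residual (Fma4), which feed div_fmas and div_fixup to produce
  // the final result.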
3087   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3088   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3089   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3090 
3091   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3092     .addUse(LHS)
3093     .addUse(RHS)
3094     .addImm(1)
3095     .setMIFlags(Flags);
3096 
3097   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3098   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3099   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3100 
3101   Register Scale;
3102   if (!ST.hasUsableDivScaleConditionOutput()) {
3103     // Work around a hardware bug on SI where the condition output from div_scale
3104     // is not usable.
3105 
3106     LLT S32 = LLT::scalar(32);
3107 
3108     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3109     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3110     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3111     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3112 
3113     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3114                               Scale1Unmerge.getReg(1));
3115     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3116                               Scale0Unmerge.getReg(1));
3117     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3118   } else {
3119     Scale = DivScale1.getReg(1);
3120   }
3121 
3122   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3123     .addUse(Fma4.getReg(0))
3124     .addUse(Fma3.getReg(0))
3125     .addUse(Mul.getReg(0))
3126     .addUse(Scale)
3127     .setMIFlags(Flags);
3128 
3129   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3130     .addUse(Fmas.getReg(0))
3131     .addUse(RHS)
3132     .addUse(LHS)
3133     .setMIFlags(Flags);
3134 
3135   MI.eraseFromParent();
3136   return true;
3137 }
3138 
3139 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3140                                                  MachineRegisterInfo &MRI,
3141                                                  MachineIRBuilder &B) const {
3142   Register Res = MI.getOperand(0).getReg();
3143   Register LHS = MI.getOperand(2).getReg();
3144   Register RHS = MI.getOperand(3).getReg();
3145   uint16_t Flags = MI.getFlags();
3146 
3147   LLT S32 = LLT::scalar(32);
3148   LLT S1 = LLT::scalar(1);
3149 
3150   auto Abs = B.buildFAbs(S32, RHS, Flags);
3151   const APFloat C0Val(1.0f);
3152 
3153   auto C0 = B.buildConstant(S32, 0x6f800000);
3154   auto C1 = B.buildConstant(S32, 0x2f800000);
3155   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
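  // 0x6f800000 is 2^+96 and 0x2f800000 is 2^-32 as f32 bit patterns. If |RHS| is
  // very large, pre-scale the denominator by 2^-32 so its reciprocal does not
  // underflow, then multiply the quotient by the same factor so the scaling
  // cancels.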
3156 
3157   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3158   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3159 
3160   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3161 
3162   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3163     .addUse(Mul0.getReg(0))
3164     .setMIFlags(Flags);
3165 
3166   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3167 
3168   B.buildFMul(Res, Sel, Mul1, Flags);
3169 
3170   MI.eraseFromParent();
3171   return true;
3172 }
3173 
3174 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3175 // FIXME: Why do we handle this one but not other removed instructions?
3176 //
3177 // Reciprocal square root.  The clamp prevents infinite results, clamping
3178 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3179 // +-max_float.
3180 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3181                                                     MachineRegisterInfo &MRI,
3182                                                     MachineIRBuilder &B) const {
3183   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3184     return true;
3185 
3186   Register Dst = MI.getOperand(0).getReg();
3187   Register Src = MI.getOperand(2).getReg();
3188   auto Flags = MI.getFlags();
3189 
3190   LLT Ty = MRI.getType(Dst);
3191 
3192   const fltSemantics *FltSemantics;
3193   if (Ty == LLT::scalar(32))
3194     FltSemantics = &APFloat::IEEEsingle();
3195   else if (Ty == LLT::scalar(64))
3196     FltSemantics = &APFloat::IEEEdouble();
3197   else
3198     return false;
3199 
3200   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3201     .addUse(Src)
3202     .setMIFlags(Flags);
3203 
3204   // We don't need to concern ourselves with the snan handling difference, since
3205   // the rsq result is already quieted (or not); use the form that selects directly.
3206   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3207   const bool UseIEEE = MFI->getMode().IEEE;
3208 
3209   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3210   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3211                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3212 
3213   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3214 
3215   if (UseIEEE)
3216     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3217   else
3218     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3219   MI.eraseFromParent();
3220   return true;
3221 }
3222 
3223 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3224   switch (IID) {
3225   case Intrinsic::amdgcn_ds_fadd:
3226     return AMDGPU::G_ATOMICRMW_FADD;
3227   case Intrinsic::amdgcn_ds_fmin:
3228     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3229   case Intrinsic::amdgcn_ds_fmax:
3230     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3231   default:
3232     llvm_unreachable("not a DS FP intrinsic");
3233   }
3234 }
3235 
3236 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3237                                                       MachineInstr &MI,
3238                                                       Intrinsic::ID IID) const {
3239   GISelChangeObserver &Observer = Helper.Observer;
3240   Observer.changingInstr(MI);
3241 
3242   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3243 
3244   // The remaining operands were used to set fields in the MemOperand on
3245   // construction.
3246   for (int I = 6; I > 3; --I)
3247     MI.RemoveOperand(I);
3248 
3249   MI.RemoveOperand(1); // Remove the intrinsic ID.
3250   Observer.changedInstr(MI);
3251   return true;
3252 }
3253 
3254 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3255                                             MachineRegisterInfo &MRI,
3256                                             MachineIRBuilder &B) const {
3257   uint64_t Offset =
3258     ST.getTargetLowering()->getImplicitParameterOffset(
3259       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3260   LLT DstTy = MRI.getType(DstReg);
3261   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3262 
3263   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3264   if (!loadInputValue(KernargPtrReg, B,
3265                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3266     return false;
3267 
3268   // FIXME: This should be nuw
3269   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3270   return true;
3271 }
3272 
3273 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3274                                                  MachineRegisterInfo &MRI,
3275                                                  MachineIRBuilder &B) const {
3276   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3277   if (!MFI->isEntryFunction()) {
3278     return legalizePreloadedArgIntrin(MI, MRI, B,
3279                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3280   }
3281 
3282   Register DstReg = MI.getOperand(0).getReg();
3283   if (!getImplicitArgPtr(DstReg, MRI, B))
3284     return false;
3285 
3286   MI.eraseFromParent();
3287   return true;
3288 }
3289 
3290 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3291                                               MachineRegisterInfo &MRI,
3292                                               MachineIRBuilder &B,
3293                                               unsigned AddrSpace) const {
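  // A flat pointer lies in the queried segment iff the high 32 bits of its
  // address equal that segment's aperture base, so compare the extracted high
  // dword against the aperture register.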
3294   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3295   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3296   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3297   MI.eraseFromParent();
3298   return true;
3299 }
3300 
3301 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3302 // offset (the offset that is included in bounds checking and swizzling, to be
3303 // split between the instruction's voffset and immoffset fields) and soffset
3304 // (the offset that is excluded from bounds checking and swizzling, to go in
3305 // the instruction's soffset field).  This function takes the first kind of
3306 // offset and figures out how to split it between voffset and immoffset.
3307 std::tuple<Register, unsigned, unsigned>
3308 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3309                                         Register OrigOffset) const {
3310   const unsigned MaxImm = 4095;
3311   Register BaseReg;
3312   unsigned TotalConstOffset;
3313   MachineInstr *OffsetDef;
3314   const LLT S32 = LLT::scalar(32);
3315 
3316   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3317     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3318 
3319   unsigned ImmOffset = TotalConstOffset;
3320 
3321   // If the immediate value is too big for the immoffset field, put the value
3322   // and -4096 into the immoffset field so that the value that is copied/added
3323   // for the voffset field is a multiple of 4096, and it stands more chance
3324   // of being CSEd with the copy/add for another similar load/store.
3325   // However, do not do that rounding down to a multiple of 4096 if that is a
3326   // negative number, as it appears to be illegal to have a negative offset
3327   // in the vgpr, even if adding the immediate offset makes it positive.
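  // For example, a constant offset of 4160 splits into Overflow = 4096 (folded
  // into the voffset) and ImmOffset = 64, while a negative constant such as -16
  // stays entirely in the voffset with ImmOffset = 0.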
3328   unsigned Overflow = ImmOffset & ~MaxImm;
3329   ImmOffset -= Overflow;
3330   if ((int32_t)Overflow < 0) {
3331     Overflow += ImmOffset;
3332     ImmOffset = 0;
3333   }
3334 
3335   if (Overflow != 0) {
3336     if (!BaseReg) {
3337       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3338     } else {
3339       auto OverflowVal = B.buildConstant(S32, Overflow);
3340       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3341     }
3342   }
3343 
3344   if (!BaseReg)
3345     BaseReg = B.buildConstant(S32, 0).getReg(0);
3346 
3347   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3348 }
3349 
3350 /// Handle register layout difference for f16 images for some subtargets.
3351 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3352                                              MachineRegisterInfo &MRI,
3353                                              Register Reg) const {
3354   if (!ST.hasUnpackedD16VMem())
3355     return Reg;
3356 
3357   const LLT S16 = LLT::scalar(16);
3358   const LLT S32 = LLT::scalar(32);
3359   LLT StoreVT = MRI.getType(Reg);
3360   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3361 
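  // On unpacked subtargets each 16-bit element occupies the low half of its own
  // 32-bit register, so e.g. a <4 x s16> source is rewritten to <4 x s32> with
  // every element any-extended.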
3362   auto Unmerge = B.buildUnmerge(S16, Reg);
3363 
3364   SmallVector<Register, 4> WideRegs;
3365   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3366     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3367 
3368   int NumElts = StoreVT.getNumElements();
3369 
3370   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3371 }
3372 
3373 Register AMDGPULegalizerInfo::fixStoreSourceType(
3374   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3375   MachineRegisterInfo *MRI = B.getMRI();
3376   LLT Ty = MRI->getType(VData);
3377 
3378   const LLT S16 = LLT::scalar(16);
3379 
3380   // Fixup illegal register types for i8 stores.
3381   if (Ty == LLT::scalar(8) || Ty == S16) {
3382     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3383     return AnyExt;
3384   }
3385 
3386   if (Ty.isVector()) {
3387     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3388       if (IsFormat)
3389         return handleD16VData(B, *MRI, VData);
3390     }
3391   }
3392 
3393   return VData;
3394 }
3395 
3396 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3397                                               MachineRegisterInfo &MRI,
3398                                               MachineIRBuilder &B,
3399                                               bool IsTyped,
3400                                               bool IsFormat) const {
3401   Register VData = MI.getOperand(1).getReg();
3402   LLT Ty = MRI.getType(VData);
3403   LLT EltTy = Ty.getScalarType();
3404   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3405   const LLT S32 = LLT::scalar(32);
3406 
3407   VData = fixStoreSourceType(B, VData, IsFormat);
3408   Register RSrc = MI.getOperand(2).getReg();
3409 
3410   MachineMemOperand *MMO = *MI.memoperands_begin();
3411   const int MemSize = MMO->getSize();
3412 
3413   unsigned ImmOffset;
3414   unsigned TotalOffset;
3415 
3416   // The typed intrinsics add an immediate after the registers.
3417   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3418 
3419   // The struct intrinsic variants add one additional operand over raw.
3420   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3421   Register VIndex;
3422   int OpOffset = 0;
3423   if (HasVIndex) {
3424     VIndex = MI.getOperand(3).getReg();
3425     OpOffset = 1;
3426   }
3427 
3428   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3429   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3430 
3431   unsigned Format = 0;
3432   if (IsTyped) {
3433     Format = MI.getOperand(5 + OpOffset).getImm();
3434     ++OpOffset;
3435   }
3436 
3437   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3438 
3439   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3440   if (TotalOffset != 0)
3441     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3442 
3443   unsigned Opc;
3444   if (IsTyped) {
3445     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3446                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3447   } else if (IsFormat) {
3448     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3449                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3450   } else {
3451     switch (MemSize) {
3452     case 1:
3453       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3454       break;
3455     case 2:
3456       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3457       break;
3458     default:
3459       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3460       break;
3461     }
3462   }
3463 
3464   if (!VIndex)
3465     VIndex = B.buildConstant(S32, 0).getReg(0);
3466 
3467   auto MIB = B.buildInstr(Opc)
3468     .addUse(VData)              // vdata
3469     .addUse(RSrc)               // rsrc
3470     .addUse(VIndex)             // vindex
3471     .addUse(VOffset)            // voffset
3472     .addUse(SOffset)            // soffset
3473     .addImm(ImmOffset);         // offset(imm)
3474 
3475   if (IsTyped)
3476     MIB.addImm(Format);
3477 
3478   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3479      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3480      .addMemOperand(MMO);
3481 
3482   MI.eraseFromParent();
3483   return true;
3484 }
3485 
3486 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3487                                              MachineRegisterInfo &MRI,
3488                                              MachineIRBuilder &B,
3489                                              bool IsFormat,
3490                                              bool IsTyped) const {
3491   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3492   MachineMemOperand *MMO = *MI.memoperands_begin();
3493   const int MemSize = MMO->getSize();
3494   const LLT S32 = LLT::scalar(32);
3495 
3496   Register Dst = MI.getOperand(0).getReg();
3497   Register RSrc = MI.getOperand(2).getReg();
3498 
3499   // The typed intrinsics add an immediate after the registers.
3500   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3501 
3502   // The struct intrinsic variants add one additional operand over raw.
3503   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3504   Register VIndex;
3505   int OpOffset = 0;
3506   if (HasVIndex) {
3507     VIndex = MI.getOperand(3).getReg();
3508     OpOffset = 1;
3509   }
3510 
3511   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3512   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3513 
3514   unsigned Format = 0;
3515   if (IsTyped) {
3516     Format = MI.getOperand(5 + OpOffset).getImm();
3517     ++OpOffset;
3518   }
3519 
3520   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3521   unsigned ImmOffset;
3522   unsigned TotalOffset;
3523 
3524   LLT Ty = MRI.getType(Dst);
3525   LLT EltTy = Ty.getScalarType();
3526   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3527   const bool Unpacked = ST.hasUnpackedD16VMem();
3528 
3529   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3530   if (TotalOffset != 0)
3531     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3532 
3533   unsigned Opc;
3534 
3535   if (IsTyped) {
3536     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3537                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3538   } else if (IsFormat) {
3539     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3540                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3541   } else {
3542     switch (MemSize) {
3543     case 1:
3544       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3545       break;
3546     case 2:
3547       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3548       break;
3549     default:
3550       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3551       break;
3552     }
3553   }
3554 
3555   Register LoadDstReg;
3556 
3557   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3558   LLT UnpackedTy = Ty.changeElementSize(32);
3559 
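  // Three destination cases: extending loads (sub-dword, or scalar d16) go
  // through a temporary s32 and are truncated afterwards; unpacked d16 vectors
  // are loaded as 32-bit elements and repacked; everything else writes Dst
  // directly.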
3560   if (IsExtLoad)
3561     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3562   else if (Unpacked && IsD16 && Ty.isVector())
3563     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3564   else
3565     LoadDstReg = Dst;
3566 
3567   if (!VIndex)
3568     VIndex = B.buildConstant(S32, 0).getReg(0);
3569 
3570   auto MIB = B.buildInstr(Opc)
3571     .addDef(LoadDstReg)         // vdata
3572     .addUse(RSrc)               // rsrc
3573     .addUse(VIndex)             // vindex
3574     .addUse(VOffset)            // voffset
3575     .addUse(SOffset)            // soffset
3576     .addImm(ImmOffset);         // offset(imm)
3577 
3578   if (IsTyped)
3579     MIB.addImm(Format);
3580 
3581   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3582      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3583      .addMemOperand(MMO);
3584 
3585   if (LoadDstReg != Dst) {
3586     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3587 
3588     // The result was widened for an extending load; truncate back to the original type.
3589     if (IsExtLoad)
3590       B.buildTrunc(Dst, LoadDstReg);
3591     else {
3592       // Repack to original 16-bit vector result
3593       // FIXME: G_TRUNC should work, but legalization currently fails
3594       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3595       SmallVector<Register, 4> Repack;
3596       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3597         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3598       B.buildMerge(Dst, Repack);
3599     }
3600   }
3601 
3602   MI.eraseFromParent();
3603   return true;
3604 }
3605 
3606 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3607                                                MachineIRBuilder &B,
3608                                                bool IsInc) const {
3609   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3610                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3611   B.buildInstr(Opc)
3612     .addDef(MI.getOperand(0).getReg())
3613     .addUse(MI.getOperand(2).getReg())
3614     .addUse(MI.getOperand(3).getReg())
3615     .cloneMemRefs(MI);
3616   MI.eraseFromParent();
3617   return true;
3618 }
3619 
3620 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3621   switch (IntrID) {
3622   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3623   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3624     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3625   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3626   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3627     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3628   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3629   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3630     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3631   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3632   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3633     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3634   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3635   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3636     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3637   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3638   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3639     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3640   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3641   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3642     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3643   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3644   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3645     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3646   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3647   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3648     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3649   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3650   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3651     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3652   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3653   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3654     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3655   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3656   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3657     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3658   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3659   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3660     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3661   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3662   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3663     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3664   default:
3665     llvm_unreachable("unhandled atomic opcode");
3666   }
3667 }
3668 
3669 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3670                                                MachineIRBuilder &B,
3671                                                Intrinsic::ID IID) const {
3672   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3673                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3674   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3675 
3676   Register Dst;
3677 
3678   int OpOffset = 0;
3679   if (HasReturn) {
3680     // A few FP atomics do not support return values.
3681     Dst = MI.getOperand(0).getReg();
3682   } else {
3683     OpOffset = -1;
3684   }
3685 
3686   Register VData = MI.getOperand(2 + OpOffset).getReg();
3687   Register CmpVal;
3688 
3689   if (IsCmpSwap) {
3690     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3691     ++OpOffset;
3692   }
3693 
3694   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3695   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3696 
3697   // The struct intrinsic variants add one additional operand over raw.
3698   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3699   Register VIndex;
3700   if (HasVIndex) {
3701     VIndex = MI.getOperand(4 + OpOffset).getReg();
3702     ++OpOffset;
3703   }
3704 
3705   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3706   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3707   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3708 
3709   MachineMemOperand *MMO = *MI.memoperands_begin();
3710 
3711   unsigned ImmOffset;
3712   unsigned TotalOffset;
3713   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3714   if (TotalOffset != 0)
3715     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3716 
3717   if (!VIndex)
3718     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3719 
3720   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3721 
3722   if (HasReturn)
3723     MIB.addDef(Dst);
3724 
3725   MIB.addUse(VData); // vdata
3726 
3727   if (IsCmpSwap)
3728     MIB.addReg(CmpVal);
3729 
3730   MIB.addUse(RSrc)               // rsrc
3731      .addUse(VIndex)             // vindex
3732      .addUse(VOffset)            // voffset
3733      .addUse(SOffset)            // soffset
3734      .addImm(ImmOffset)          // offset(imm)
3735      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3736      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3737      .addMemOperand(MMO);
3738 
3739   MI.eraseFromParent();
3740   return true;
3741 }
3742 
3743 /// Pack the address operands of \p MI in [\p AddrIdx, \p EndIdx) into dword
3744 /// sized registers of two s16 elements each, appending them to \p PackedAddrs.
3745 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3746                                         SmallVectorImpl<Register> &PackedAddrs,
3747                                         int AddrIdx, int DimIdx, int EndIdx,
3748                                         int NumGradients) {
3749   const LLT S16 = LLT::scalar(16);
3750   const LLT V2S16 = LLT::vector(2, 16);
3751 
3752   for (int I = AddrIdx; I < EndIdx; ++I) {
3753     MachineOperand &SrcOp = MI.getOperand(I);
3754     if (!SrcOp.isReg())
3755       continue; // _L to _LZ may have eliminated this.
3756 
3757     Register AddrReg = SrcOp.getReg();
3758 
3759     if (I < DimIdx) {
3760       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3761       PackedAddrs.push_back(AddrReg);
3762     } else {
3763       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3764       // derivatives dx/dh and dx/dv are packed with undef.
3765       if (((I + 1) >= EndIdx) ||
3766           ((NumGradients / 2) % 2 == 1 &&
3767            (I == DimIdx + (NumGradients / 2) - 1 ||
3768             I == DimIdx + NumGradients - 1)) ||
3769           // Check for _L to _LZ optimization
3770           !MI.getOperand(I + 1).isReg()) {
3771         PackedAddrs.push_back(
3772             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3773                 .getReg(0));
3774       } else {
3775         PackedAddrs.push_back(
3776             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3777                 .getReg(0));
3778         ++I;
3779       }
3780     }
3781   }
3782 }
3783 
3784 /// Convert from separate vaddr components to a single vector address register,
3785 /// and replace the remaining operands with $noreg.
3786 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3787                                      int DimIdx, int NumVAddrs) {
3788   const LLT S32 = LLT::scalar(32);
3789 
3790   SmallVector<Register, 8> AddrRegs;
3791   for (int I = 0; I != NumVAddrs; ++I) {
3792     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3793     if (SrcOp.isReg()) {
3794       AddrRegs.push_back(SrcOp.getReg());
3795       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3796     }
3797   }
3798 
3799   int NumAddrRegs = AddrRegs.size();
3800   if (NumAddrRegs != 1) {
3801     // Round up to 8 elements for v5-v7
3802     // FIXME: Missing intermediate sized register classes and instructions.
3803     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3804       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3805       auto Undef = B.buildUndef(S32);
3806       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3807       NumAddrRegs = RoundedNumRegs;
3808     }
3809 
3810     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3811     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3812   }
3813 
3814   for (int I = 1; I != NumVAddrs; ++I) {
3815     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3816     if (SrcOp.isReg())
3817       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3818   }
3819 }
3820 
3821 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3822 ///
3823 /// Depending on the subtarget, loads and stores with 16-bit element data need to
3824 /// be rewritten to use the low half of 32-bit registers, or to directly use a
3825 /// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
3826 /// registers.
3827 ///
3828 /// We don't want to directly select image instructions just yet, but we also want
3829 /// to expose all register repacking to the legalizer/combiners. We also don't
3830 /// want a selected instruction entering RegBankSelect. In order to avoid
3831 /// defining a multitude of intermediate image instructions, directly hack on
3832 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3833 /// now-unnecessary arguments with $noreg.
3834 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3835     MachineInstr &MI, MachineIRBuilder &B,
3836     GISelChangeObserver &Observer,
3837     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3838 
3839   const int NumDefs = MI.getNumExplicitDefs();
3840   bool IsTFE = NumDefs == 2;
3841   // We are only processing the operands of d16 image operations on subtargets
3842   // that use the unpacked register layout, or need to repack the TFE result.
3843 
3844   // TODO: Do we need to guard against already legalized intrinsics?
3845   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3846     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3847 
3848   MachineRegisterInfo *MRI = B.getMRI();
3849   const LLT S32 = LLT::scalar(32);
3850   const LLT S16 = LLT::scalar(16);
3851   const LLT V2S16 = LLT::vector(2, 16);
3852 
3853   // Index of first address argument
3854   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3855 
3856   int NumVAddrs, NumGradients;
3857   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3858   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3859     getDMaskIdx(BaseOpcode, NumDefs);
3860   unsigned DMask = 0;
3861 
3862   // Check whether the addresses and gradients are 16 bit, and pack them below if so.
3863   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3864   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3865   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3866   const bool IsG16 = GradTy == S16;
3867   const bool IsA16 = AddrTy == S16;
3868 
3869   int DMaskLanes = 0;
3870   if (!BaseOpcode->Atomic) {
3871     DMask = MI.getOperand(DMaskIdx).getImm();
3872     if (BaseOpcode->Gather4) {
3873       DMaskLanes = 4;
3874     } else if (DMask != 0) {
3875       DMaskLanes = countPopulation(DMask);
3876     } else if (!IsTFE && !BaseOpcode->Store) {
3877       // If dmask is 0, this is a no-op load. This can be eliminated.
3878       B.buildUndef(MI.getOperand(0));
3879       MI.eraseFromParent();
3880       return true;
3881     }
3882   }
3883 
3884   Observer.changingInstr(MI);
3885   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3886 
3887   unsigned NewOpcode = NumDefs == 0 ?
3888     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3889 
3890   // Track that we legalized this
3891   MI.setDesc(B.getTII().get(NewOpcode));
3892 
3893   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
3894   // to be at least 1; otherwise the instruction will fail.
3895   if (IsTFE && DMask == 0) {
3896     DMask = 0x1;
3897     DMaskLanes = 1;
3898     MI.getOperand(DMaskIdx).setImm(DMask);
3899   }
3900 
3901   if (BaseOpcode->Atomic) {
3902     Register VData0 = MI.getOperand(2).getReg();
3903     LLT Ty = MRI->getType(VData0);
3904 
3905     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3906     if (Ty.isVector())
3907       return false;
3908 
3909     if (BaseOpcode->AtomicX2) {
3910       Register VData1 = MI.getOperand(3).getReg();
3911       // The two values are packed in one register.
3912       LLT PackedTy = LLT::vector(2, Ty);
3913       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3914       MI.getOperand(2).setReg(Concat.getReg(0));
3915       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3916     }
3917   }
3918 
3919   int CorrectedNumVAddrs = NumVAddrs;
3920 
3921   // Optimize _L to _LZ when _L is zero
3922   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3923         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3924     const ConstantFP *ConstantLod;
3925     const int LodIdx = AddrIdx + NumVAddrs - 1;
3926 
3927     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3928       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3929         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3930         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3931           LZMappingInfo->LZ, ImageDimIntr->Dim);
3932 
3933         // The starting indexes should remain in the same place.
3934         --NumVAddrs;
3935         --CorrectedNumVAddrs;
3936 
3937         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3938           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3939         MI.RemoveOperand(LodIdx);
3940       }
3941     }
3942   }
3943 
3944   // Optimize _mip away, when 'lod' is zero
3945   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3946     int64_t ConstantLod;
3947     const int LodIdx = AddrIdx + NumVAddrs - 1;
3948 
3949     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3950       if (ConstantLod == 0) {
3951         // TODO: Change the intrinsic opcode and remove the operand instead of
3952         // replacing it with 0, as is done for the _L to _LZ handling above.
3953         MI.getOperand(LodIdx).ChangeToImmediate(0);
3954         --CorrectedNumVAddrs;
3955       }
3956     }
3957   }
3958 
3959   // Rewrite the addressing register layout before doing anything else.
3960   if (IsA16 || IsG16) {
3961     if (IsA16) {
3962       // Target must support the feature and gradients need to be 16 bit too
3963       if (!ST.hasA16() || !IsG16)
3964         return false;
3965     } else if (!ST.hasG16())
3966       return false;
3967 
3968     if (NumVAddrs > 1) {
3969       SmallVector<Register, 4> PackedRegs;
3970       // Don't compress addresses for G16
3971       const int PackEndIdx =
3972           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3973       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3974                                   PackEndIdx, NumGradients);
3975 
3976       if (!IsA16) {
3977         // Add uncompressed address
3978         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3979           int AddrReg = MI.getOperand(I).getReg();
3980           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3981           PackedRegs.push_back(AddrReg);
3982         }
3983       }
3984 
3985       // See also below in the non-a16 branch
3986       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3987 
3988       if (!UseNSA && PackedRegs.size() > 1) {
3989         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3990         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3991         PackedRegs[0] = Concat.getReg(0);
3992         PackedRegs.resize(1);
3993       }
3994 
3995       const int NumPacked = PackedRegs.size();
3996       for (int I = 0; I != NumVAddrs; ++I) {
3997         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3998         if (!SrcOp.isReg()) {
3999           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4000           continue;
4001         }
4002 
4003         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4004 
4005         if (I < NumPacked)
4006           SrcOp.setReg(PackedRegs[I]);
4007         else
4008           SrcOp.setReg(AMDGPU::NoRegister);
4009       }
4010     }
4011   } else {
4012     // If the register allocator cannot place the address registers contiguously
4013     // without introducing moves, then using the non-sequential address encoding
4014     // is always preferable, since it saves VALU instructions and is usually a
4015     // wash in terms of code size or even better.
4016     //
4017     // However, we currently have no way of hinting to the register allocator
4018     // that MIMG addresses should be placed contiguously when it is possible to
4019     // do so, so force non-NSA for the common 2-address case as a heuristic.
4020     //
4021     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4022     // allocation when possible.
4023     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4024 
4025     if (!UseNSA && NumVAddrs > 1)
4026       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4027   }
4028 
4029   int Flags = 0;
4030   if (IsA16)
4031     Flags |= 1;
4032   if (IsG16)
4033     Flags |= 2;
4034   MI.addOperand(MachineOperand::CreateImm(Flags));
4035 
4036   if (BaseOpcode->Store) { // No TFE for stores?
4037     // TODO: Handle dmask trim
4038     Register VData = MI.getOperand(1).getReg();
4039     LLT Ty = MRI->getType(VData);
4040     if (!Ty.isVector() || Ty.getElementType() != S16)
4041       return true;
4042 
4043     Register RepackedReg = handleD16VData(B, *MRI, VData);
4044     if (RepackedReg != VData) {
4045       MI.getOperand(1).setReg(RepackedReg);
4046     }
4047 
4048     return true;
4049   }
4050 
4051   Register DstReg = MI.getOperand(0).getReg();
4052   LLT Ty = MRI->getType(DstReg);
4053   const LLT EltTy = Ty.getScalarType();
4054   const bool IsD16 = Ty.getScalarType() == S16;
4055   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4056 
4057   // Confirm that the return type is large enough for the dmask specified
4058   if (NumElts < DMaskLanes)
4059     return false;
4060 
4061   if (NumElts > 4 || DMaskLanes > 4)
4062     return false;
4063 
4064   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4065   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4066 
4067   // The raw dword aligned data component of the load. The only legal cases
4068   // where this matters should be when using the packed D16 format, for
4069   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4070   LLT RoundedTy;
4071 
4072   // S32 vector to cover all data, plus the TFE result element.
4073   LLT TFETy;
4074 
4075   // Register type to use for each loaded component. Will be S32 or V2S16.
4076   LLT RegTy;
4077 
4078   if (IsD16 && ST.hasUnpackedD16VMem()) {
4079     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4080     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4081     RegTy = S32;
4082   } else {
4083     unsigned EltSize = EltTy.getSizeInBits();
4084     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4085     unsigned RoundedSize = 32 * RoundedElts;
4086     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4087     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4088     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4089   }
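  // For example, on a packed d16 subtarget a <3 x s16> result with 3 dmask lanes
  // gives AdjustedTy = <3 x s16>, RoundedTy = <4 x s16>, TFETy = <3 x s32>, and
  // RegTy = V2S16 when TFE is off.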
4090 
4091   // The return type does not need adjustment.
4092   // TODO: Should we change s16 case to s32 or <2 x s16>?
4093   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4094     return true;
4095 
4096   Register Dst1Reg;
4097 
4098   // Insert after the instruction.
4099   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4100 
4101   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4102   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4103   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4104   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4105 
4106   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4107 
4108   MI.getOperand(0).setReg(NewResultReg);
4109 
4110   // In the IR, TFE is supposed to be used with a 2 element struct return
4111   // type. The instruction really returns these two values in one contiguous
4112   // register, with one additional dword beyond the loaded data. Rewrite the
4113   // return type to use a single register result.
4114 
4115   if (IsTFE) {
4116     Dst1Reg = MI.getOperand(1).getReg();
4117     if (MRI->getType(Dst1Reg) != S32)
4118       return false;
4119 
4120     // TODO: Make sure the TFE operand bit is set.
4121     MI.RemoveOperand(1);
4122 
4123     // Handle the easy case that requires no repack instructions.
4124     if (Ty == S32) {
4125       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4126       return true;
4127     }
4128   }
4129 
4130   // Now figure out how to copy the new result register back into the old
4131   // result.
4132   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4133 
4134   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4135 
4136   if (ResultNumRegs == 1) {
4137     assert(!IsTFE);
4138     ResultRegs[0] = NewResultReg;
4139   } else {
4140     // We have to repack into a new vector of some kind.
4141     for (int I = 0; I != NumDataRegs; ++I)
4142       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4143     B.buildUnmerge(ResultRegs, NewResultReg);
4144 
4145     // Drop the final TFE element to get the data part. The TFE result is
4146     // directly written to the right place already.
4147     if (IsTFE)
4148       ResultRegs.resize(NumDataRegs);
4149   }
4150 
4151   // For an s16 scalar result, we form an s32 result with a truncate regardless
4152   // of packed vs. unpacked.
4153   if (IsD16 && !Ty.isVector()) {
4154     B.buildTrunc(DstReg, ResultRegs[0]);
4155     return true;
4156   }
4157 
4158   // Avoid a build/concat_vector of 1 entry.
4159   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4160     B.buildBitcast(DstReg, ResultRegs[0]);
4161     return true;
4162   }
4163 
4164   assert(Ty.isVector());
4165 
4166   if (IsD16) {
4167     // For packed D16 results with TFE enabled, all the data components are
4168     // S32. Cast back to the expected type.
4169     //
4170     // TODO: We don't really need to load s32 elements. We would only need one
4171     // cast for the TFE result if a multiple of v2s16 was used.
4172     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4173       for (Register &Reg : ResultRegs)
4174         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4175     } else if (ST.hasUnpackedD16VMem()) {
4176       for (Register &Reg : ResultRegs)
4177         Reg = B.buildTrunc(S16, Reg).getReg(0);
4178     }
4179   }
4180 
4181   auto padWithUndef = [&](LLT Ty, int NumElts) {
4182     if (NumElts == 0)
4183       return;
4184     Register Undef = B.buildUndef(Ty).getReg(0);
4185     for (int I = 0; I != NumElts; ++I)
4186       ResultRegs.push_back(Undef);
4187   };
4188 
4189   // Pad out any elements eliminated due to the dmask.
4190   LLT ResTy = MRI->getType(ResultRegs[0]);
4191   if (!ResTy.isVector()) {
4192     padWithUndef(ResTy, NumElts - ResultRegs.size());
4193     B.buildBuildVector(DstReg, ResultRegs);
4194     return true;
4195   }
4196 
4197   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4198   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4199 
4200   // Deal with the one annoying legal case.
4201   const LLT V3S16 = LLT::vector(3, 16);
4202   if (Ty == V3S16) {
4203     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4204     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4205     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4206     return true;
4207   }
4208 
4209   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4210   B.buildConcatVectors(DstReg, ResultRegs);
4211   return true;
4212 }
4213 
4214 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4215   LegalizerHelper &Helper, MachineInstr &MI) const {
4216   MachineIRBuilder &B = Helper.MIRBuilder;
4217   GISelChangeObserver &Observer = Helper.Observer;
4218 
4219   Register Dst = MI.getOperand(0).getReg();
4220   LLT Ty = B.getMRI()->getType(Dst);
4221   unsigned Size = Ty.getSizeInBits();
4222   MachineFunction &MF = B.getMF();
4223 
4224   Observer.changingInstr(MI);
4225 
4226   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4227     Ty = getBitcastRegisterType(Ty);
4228     Helper.bitcastDst(MI, Ty, 0);
4229     Dst = MI.getOperand(0).getReg();
4230     B.setInsertPt(B.getMBB(), MI);
4231   }
4232 
4233   // FIXME: We don't really need this intermediate instruction. The intrinsic
4234   // should be fixed to have a memory operand. Since it's readnone, we're not
4235   // allowed to add one.
4236   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4237   MI.RemoveOperand(1); // Remove intrinsic ID
4238 
4239   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4240   // TODO: Should this use datalayout alignment?
4241   const unsigned MemSize = (Size + 7) / 8;
4242   const Align MemAlign(4);
4243   MachineMemOperand *MMO = MF.getMachineMemOperand(
4244       MachinePointerInfo(),
4245       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4246           MachineMemOperand::MOInvariant,
4247       MemSize, MemAlign);
4248   MI.addMemOperand(MF, MMO);
4249 
4250   // There are no 96-bit result scalar loads, but widening to 128-bit should
4251   // always be legal. We may need to restore this to a 96-bit result if it turns
4252   // out this needs to be converted to a vector load during RegBankSelect.
4253   if (!isPowerOf2_32(Size)) {
4254     if (Ty.isVector())
4255       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4256     else
4257       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4258   }
4259 
4260   Observer.changedInstr(MI);
4261   return true;
4262 }
4263 
4264 // TODO: Move to selection
4265 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4266                                                 MachineRegisterInfo &MRI,
4267                                                 MachineIRBuilder &B) const {
4268   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4269   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4270       !ST.isTrapHandlerEnabled()) {
4271     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4272   } else {
4273     // Pass queue pointer to trap handler as input, and insert trap instruction
4274     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4275     MachineRegisterInfo &MRI = *B.getMRI();
4276 
4277     Register LiveIn =
4278       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4279     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4280       return false;
4281 
4282     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4283     B.buildCopy(SGPR01, LiveIn);
4284     B.buildInstr(AMDGPU::S_TRAP)
4285         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4286         .addReg(SGPR01, RegState::Implicit);
4287   }
4288 
4289   MI.eraseFromParent();
4290   return true;
4291 }
4292 
4293 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4294     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4295   // If this is a non-HSA path or the trap handler is disabled, report a warning
4296   // accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

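// Custom legalization entry point for target intrinsics. Most cases simply
// dispatch to a dedicated legalize* helper keyed on the intrinsic ID.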
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND that uses this intrinsic with the exec-manipulating
  // branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

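      // Operand 1 is the mask result of the intrinsic; operand 3 is its
      // incoming condition or mask. Both are constrained to the wave mask
      // register class below.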
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

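      // As with amdgcn_if, the branch targets are swapped: SI_LOOP takes over
      // the conditional branch to UncondBrTarget, and the unconditional branch
      // is redirected to CondBrTarget.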
      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
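  // The raw and struct buffer load/store intrinsics share common helpers; the
  // boolean arguments distinguish the plain, format, and typed (tbuffer)
  // variants.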
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
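  // All buffer atomic variants are lowered by a single helper keyed on the
  // intrinsic ID.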
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
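  // Image dim intrinsics are driven by their table-generated descriptors; any
  // other intrinsic needs no custom legalization here.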
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
