1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
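// e.g. getPow2VectorType rounds <3 x s16> up to <4 x s16> and <5 x s32> up to
// <8 x s32>; only the element count changes, never the element type.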
55 
// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
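// e.g. getPow2ScalarType rounds s24 up to s32 and s96 up to s128.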
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
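// Example (fewerEltsToSize64Vector): <4 x s32> splits down to <2 x s32>
// pieces, while <2 x s64> collapses to a plain s64, since
// LLT::scalarOrVector(1, ...) yields a scalar.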
113 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
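// Example (moreEltsToNext32Bit): <3 x s8> (24 bits) widens to <4 x s8>
// (32 bits), and <5 x s16> (80 bits) widens to <6 x s16> (96 bits).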
131 
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
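// For sizes above 32 bits, getBitcastRegisterType produces a vector of 32-bit
// elements, e.g. <4 x s16> -> <2 x s32> and <8 x s16> -> <4 x s32>.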
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
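// Example (isRegisterType): s96, <2 x s64> and <4 x s16> are register types;
// s48 and <3 x s16> are not, since 48 bits is not a multiple of 32.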
207 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
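// Example (isWideScalarTruncStore): a G_STORE of an s64 value with a 32-bit
// (or narrower) memory size matches; the store rules below then narrow the
// stored value to s32.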
233 
234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
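// Example (maxSizeForAddrSpace): a global or constant load may be up to 512
// bits wide (e.g. <16 x s32>), while stores to those spaces are capped at 128
// bits; LDS allows 128 bits only when the subtarget supports 128-bit DS
// accesses (useDS128()).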
261 
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
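// Example (isLoadStoreSizeLegal): a 96-bit global access is only size-legal on
// subtargets with dwordx3 load/stores, and extending loads must produce a
// 32-bit register, so e.g. an s64 result from a 16-bit load is rejected.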
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
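// With the default -amdgpu-global-isel-new-legality=false, s96, s128 and
// <8 x s16> report true here (and get bitcast to 32-bit element vectors),
// while <4 x s32> and <2 x s64> report false and are handled directly.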
341 
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348 
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
353   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
356 
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
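// Example (shouldBitcastLoadStoreType): a 16-bit load of <2 x s8> and a
// 128-bit load of <8 x s16> both request a bitcast; the replacement types
// (s16 and <4 x s32>) come from getBitcastRegisterType above.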
362 
363 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
364                                          const GCNTargetMachine &TM)
  : ST(ST_) {
366   using namespace TargetOpcode;
367 
368   auto GetAddrSpacePtr = [&TM](unsigned AS) {
369     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
370   };
371 
372   const LLT S1 = LLT::scalar(1);
373   const LLT S16 = LLT::scalar(16);
374   const LLT S32 = LLT::scalar(32);
375   const LLT S64 = LLT::scalar(64);
376   const LLT S128 = LLT::scalar(128);
377   const LLT S256 = LLT::scalar(256);
378   const LLT S512 = LLT::scalar(512);
379   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
380 
381   const LLT V2S16 = LLT::vector(2, 16);
382   const LLT V4S16 = LLT::vector(4, 16);
383 
384   const LLT V2S32 = LLT::vector(2, 32);
385   const LLT V3S32 = LLT::vector(3, 32);
386   const LLT V4S32 = LLT::vector(4, 32);
387   const LLT V5S32 = LLT::vector(5, 32);
388   const LLT V6S32 = LLT::vector(6, 32);
389   const LLT V7S32 = LLT::vector(7, 32);
390   const LLT V8S32 = LLT::vector(8, 32);
391   const LLT V9S32 = LLT::vector(9, 32);
392   const LLT V10S32 = LLT::vector(10, 32);
393   const LLT V11S32 = LLT::vector(11, 32);
394   const LLT V12S32 = LLT::vector(12, 32);
395   const LLT V13S32 = LLT::vector(13, 32);
396   const LLT V14S32 = LLT::vector(14, 32);
397   const LLT V15S32 = LLT::vector(15, 32);
398   const LLT V16S32 = LLT::vector(16, 32);
399   const LLT V32S32 = LLT::vector(32, 32);
400 
401   const LLT V2S64 = LLT::vector(2, 64);
402   const LLT V3S64 = LLT::vector(3, 64);
403   const LLT V4S64 = LLT::vector(4, 64);
404   const LLT V5S64 = LLT::vector(5, 64);
405   const LLT V6S64 = LLT::vector(6, 64);
406   const LLT V7S64 = LLT::vector(7, 64);
407   const LLT V8S64 = LLT::vector(8, 64);
408   const LLT V16S64 = LLT::vector(16, 64);
409 
410   std::initializer_list<LLT> AllS32Vectors =
411     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
412      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
413   std::initializer_list<LLT> AllS64Vectors =
414     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
415 
416   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
417   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
418   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
419   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
420   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
421   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
422   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
423 
424   const LLT CodePtr = FlatPtr;
425 
426   const std::initializer_list<LLT> AddrSpaces64 = {
427     GlobalPtr, ConstantPtr, FlatPtr
428   };
429 
430   const std::initializer_list<LLT> AddrSpaces32 = {
431     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
432   };
433 
434   const std::initializer_list<LLT> FPTypesBase = {
435     S32, S64
436   };
437 
438   const std::initializer_list<LLT> FPTypes16 = {
439     S32, S64, S16
440   };
441 
442   const std::initializer_list<LLT> FPTypesPK16 = {
443     S32, S64, S16, V2S16
444   };
445 
446   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
447 
448   setAction({G_BRCOND, S1}, Legal); // VCC branches
449   setAction({G_BRCOND, S32}, Legal); // SCC branches
450 
451   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
452   // elements for v3s16
453   getActionDefinitionsBuilder(G_PHI)
454     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
455     .legalFor(AllS32Vectors)
456     .legalFor(AllS64Vectors)
457     .legalFor(AddrSpaces64)
458     .legalFor(AddrSpaces32)
459     .legalIf(isPointer(0))
460     .clampScalar(0, S16, S256)
461     .widenScalarToNextPow2(0, 32)
462     .clampMaxNumElements(0, S32, 16)
463     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
464     .scalarize(0);
465 
466   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
467     // Full set of gfx9 features.
468     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
469       .legalFor({S32, S16, V2S16})
470       .clampScalar(0, S16, S32)
471       .clampMaxNumElements(0, S16, 2)
472       .scalarize(0)
473       .widenScalarToNextPow2(0, 32);
474 
475     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
476       .legalFor({S32, S16, V2S16}) // Clamp modifier
477       .minScalarOrElt(0, S16)
478       .clampMaxNumElements(0, S16, 2)
479       .scalarize(0)
480       .widenScalarToNextPow2(0, 32)
481       .lower();
482   } else if (ST.has16BitInsts()) {
483     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
484       .legalFor({S32, S16})
485       .clampScalar(0, S16, S32)
486       .scalarize(0)
487       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
488 
489     // Technically the saturating operations require clamp bit support, but this
490     // was introduced at the same time as 16-bit operations.
491     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
492       .legalFor({S32, S16}) // Clamp modifier
493       .minScalar(0, S16)
494       .scalarize(0)
495       .widenScalarToNextPow2(0, 16)
496       .lower();
497 
498     // We're just lowering this, but it helps get a better result to try to
499     // coerce to the desired type first.
500     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
501       .minScalar(0, S16)
502       .scalarize(0)
503       .lower();
504   } else {
505     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
506       .legalFor({S32})
507       .clampScalar(0, S32, S32)
508       .scalarize(0);
509 
510     if (ST.hasIntClamp()) {
511       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
512         .legalFor({S32}) // Clamp modifier.
513         .scalarize(0)
514         .minScalarOrElt(0, S32)
515         .lower();
516     } else {
517       // Clamp bit support was added in VI, along with 16-bit operations.
518       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
519         .minScalar(0, S32)
520         .scalarize(0)
521         .lower();
522     }
523 
524     // FIXME: DAG expansion gets better results. The widening uses the smaller
525     // range values and goes for the min/max lowering directly.
526     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
527       .minScalar(0, S32)
528       .scalarize(0)
529       .lower();
530   }
531 
532   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
533     .customFor({S32, S64})
534     .clampScalar(0, S32, S64)
535     .widenScalarToNextPow2(0, 32)
536     .scalarize(0);
537 
538   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
539     .legalFor({S32})
540     .clampScalar(0, S32, S32)
541     .scalarize(0);
542 
543   // Report legal for any types we can handle anywhere. For the cases only legal
544   // on the SALU, RegBankSelect will be able to re-legalize.
545   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
546     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
547     .clampScalar(0, S32, S64)
548     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
549     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
550     .widenScalarToNextPow2(0)
551     .scalarize(0);
552 
553   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
554                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
555     .legalFor({{S32, S1}, {S32, S32}})
556     .minScalar(0, S32)
557     // TODO: .scalarize(0)
558     .lower();
559 
560   getActionDefinitionsBuilder(G_BITCAST)
561     // Don't worry about the size constraint.
562     .legalIf(all(isRegisterType(0), isRegisterType(1)))
563     .lower();
564 
566   getActionDefinitionsBuilder(G_CONSTANT)
567     .legalFor({S1, S32, S64, S16, GlobalPtr,
568                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
569     .legalIf(isPointer(0))
570     .clampScalar(0, S32, S64)
571     .widenScalarToNextPow2(0);
572 
573   getActionDefinitionsBuilder(G_FCONSTANT)
574     .legalFor({S32, S64, S16})
575     .clampScalar(0, S16, S64);
576 
577   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
578       .legalIf(isRegisterType(0))
579       // s1 and s16 are special cases because they have legal operations on
580       // them, but don't really occupy registers in the normal way.
581       .legalFor({S1, S16})
582       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
583       .clampScalarOrElt(0, S32, MaxScalar)
584       .widenScalarToNextPow2(0, 32)
585       .clampMaxNumElements(0, S32, 16);
586 
587   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
588 
589   // If the amount is divergent, we have to do a wave reduction to get the
590   // maximum value, so this is expanded during RegBankSelect.
591   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
592     .legalFor({{PrivatePtr, S32}});
593 
594   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
595     .customIf(typeIsNot(0, PrivatePtr));
596 
597   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
598 
599   auto &FPOpActions = getActionDefinitionsBuilder(
600     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
601     .legalFor({S32, S64});
602   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
603     .customFor({S32, S64});
604   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
605     .customFor({S32, S64});
606 
607   if (ST.has16BitInsts()) {
608     if (ST.hasVOP3PInsts())
609       FPOpActions.legalFor({S16, V2S16});
610     else
611       FPOpActions.legalFor({S16});
612 
613     TrigActions.customFor({S16});
614     FDIVActions.customFor({S16});
615   }
616 
617   auto &MinNumMaxNum = getActionDefinitionsBuilder({
618       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
619 
620   if (ST.hasVOP3PInsts()) {
621     MinNumMaxNum.customFor(FPTypesPK16)
622       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
623       .clampMaxNumElements(0, S16, 2)
624       .clampScalar(0, S16, S64)
625       .scalarize(0);
626   } else if (ST.has16BitInsts()) {
627     MinNumMaxNum.customFor(FPTypes16)
628       .clampScalar(0, S16, S64)
629       .scalarize(0);
630   } else {
631     MinNumMaxNum.customFor(FPTypesBase)
632       .clampScalar(0, S32, S64)
633       .scalarize(0);
634   }
635 
636   if (ST.hasVOP3PInsts())
637     FPOpActions.clampMaxNumElements(0, S16, 2);
638 
639   FPOpActions
640     .scalarize(0)
641     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
642 
643   TrigActions
644     .scalarize(0)
645     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
646 
647   FDIVActions
648     .scalarize(0)
649     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
650 
651   getActionDefinitionsBuilder({G_FNEG, G_FABS})
652     .legalFor(FPTypesPK16)
653     .clampMaxNumElements(0, S16, 2)
654     .scalarize(0)
655     .clampScalar(0, S16, S64);
656 
657   if (ST.has16BitInsts()) {
658     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
659       .legalFor({S32, S64, S16})
660       .scalarize(0)
661       .clampScalar(0, S16, S64);
662   } else {
663     getActionDefinitionsBuilder(G_FSQRT)
664       .legalFor({S32, S64})
665       .scalarize(0)
666       .clampScalar(0, S32, S64);
667 
668     if (ST.hasFractBug()) {
669       getActionDefinitionsBuilder(G_FFLOOR)
670         .customFor({S64})
671         .legalFor({S32, S64})
672         .scalarize(0)
673         .clampScalar(0, S32, S64);
674     } else {
675       getActionDefinitionsBuilder(G_FFLOOR)
676         .legalFor({S32, S64})
677         .scalarize(0)
678         .clampScalar(0, S32, S64);
679     }
680   }
681 
682   getActionDefinitionsBuilder(G_FPTRUNC)
683     .legalFor({{S32, S64}, {S16, S32}})
684     .scalarize(0)
685     .lower();
686 
687   getActionDefinitionsBuilder(G_FPEXT)
688     .legalFor({{S64, S32}, {S32, S16}})
689     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
690     .scalarize(0);
691 
692   getActionDefinitionsBuilder(G_FSUB)
693       // Use actual fsub instruction
694       .legalFor({S32})
695       // Must use fadd + fneg
696       .lowerFor({S64, S16, V2S16})
697       .scalarize(0)
698       .clampScalar(0, S32, S64);
699 
700   // Whether this is legal depends on the floating point mode for the function.
701   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
702   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
703     FMad.customFor({S32, S16});
704   else if (ST.hasMadMacF32Insts())
705     FMad.customFor({S32});
706   else if (ST.hasMadF16())
707     FMad.customFor({S16});
708   FMad.scalarize(0)
709       .lower();
710 
711   auto &FRem = getActionDefinitionsBuilder(G_FREM);
712   if (ST.has16BitInsts()) {
713     FRem.customFor({S16, S32, S64});
714   } else {
715     FRem.minScalar(0, S32)
716         .customFor({S32, S64});
717   }
718   FRem.scalarize(0);
719 
720   // TODO: Do we need to clamp maximum bitwidth?
721   getActionDefinitionsBuilder(G_TRUNC)
722     .legalIf(isScalar(0))
723     .legalFor({{V2S16, V2S32}})
724     .clampMaxNumElements(0, S16, 2)
725     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
726     // situations (like an invalid implicit use), we don't want to infinite loop
727     // in the legalizer.
728     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
729     .alwaysLegal();
730 
731   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
732     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
733                {S32, S1}, {S64, S1}, {S16, S1}})
734     .scalarize(0)
735     .clampScalar(0, S32, S64)
736     .widenScalarToNextPow2(1, 32);
737 
738   // TODO: Split s1->s64 during regbankselect for VALU.
739   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
740     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
741     .lowerFor({{S32, S64}})
742     .lowerIf(typeIs(1, S1))
743     .customFor({{S64, S64}});
744   if (ST.has16BitInsts())
745     IToFP.legalFor({{S16, S16}});
746   IToFP.clampScalar(1, S32, S64)
747        .minScalar(0, S32)
748        .scalarize(0)
749        .widenScalarToNextPow2(1);
750 
751   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
752     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
753     .customFor({{S64, S64}})
754     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
755   if (ST.has16BitInsts())
756     FPToI.legalFor({{S16, S16}});
757   else
758     FPToI.minScalar(1, S32);
759 
760   FPToI.minScalar(0, S32)
761        .scalarize(0)
762        .lower();
763 
764   // Lower roundeven into G_FRINT
765   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
766     .scalarize(0)
767     .lower();
768 
769   if (ST.has16BitInsts()) {
770     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
771       .legalFor({S16, S32, S64})
772       .clampScalar(0, S16, S64)
773       .scalarize(0);
774   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
775     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
776       .legalFor({S32, S64})
777       .clampScalar(0, S32, S64)
778       .scalarize(0);
779   } else {
780     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
781       .legalFor({S32})
782       .customFor({S64})
783       .clampScalar(0, S32, S64)
784       .scalarize(0);
785   }
786 
787   getActionDefinitionsBuilder(G_PTR_ADD)
788     .legalIf(all(isPointer(0), sameSize(0, 1)))
789     .scalarize(0)
790     .scalarSameSizeAs(1, 0);
791 
792   getActionDefinitionsBuilder(G_PTRMASK)
793     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
794     .scalarSameSizeAs(1, 0)
795     .scalarize(0);
796 
797   auto &CmpBuilder =
798     getActionDefinitionsBuilder(G_ICMP)
799     // The compare output type differs based on the register bank of the output,
800     // so make both s1 and s32 legal.
801     //
802     // Scalar compares producing output in scc will be promoted to s32, as that
803     // is the allocatable register type that will be needed for the copy from
804     // scc. This will be promoted during RegBankSelect, and we assume something
805     // before that won't try to use s32 result types.
806     //
807     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
808     // bank.
809     .legalForCartesianProduct(
810       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
811     .legalForCartesianProduct(
812       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
813   if (ST.has16BitInsts()) {
814     CmpBuilder.legalFor({{S1, S16}});
815   }
816 
817   CmpBuilder
818     .widenScalarToNextPow2(1)
819     .clampScalar(1, S32, S64)
820     .scalarize(0)
821     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
822 
823   getActionDefinitionsBuilder(G_FCMP)
824     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
825     .widenScalarToNextPow2(1)
826     .clampScalar(1, S32, S64)
827     .scalarize(0);
828 
829   // FIXME: fpow has a selection pattern that should move to custom lowering.
830   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
831   if (ST.has16BitInsts())
832     Exp2Ops.legalFor({S32, S16});
833   else
834     Exp2Ops.legalFor({S32});
835   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
836   Exp2Ops.scalarize(0);
837 
838   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
839   if (ST.has16BitInsts())
840     ExpOps.customFor({{S32}, {S16}});
841   else
842     ExpOps.customFor({S32});
843   ExpOps.clampScalar(0, MinScalarFPTy, S32)
844         .scalarize(0);
845 
846   getActionDefinitionsBuilder(G_FPOWI)
847     .clampScalar(0, MinScalarFPTy, S32)
848     .lower();
849 
850   // The 64-bit versions produce 32-bit results, but only on the SALU.
851   getActionDefinitionsBuilder(G_CTPOP)
852     .legalFor({{S32, S32}, {S32, S64}})
853     .clampScalar(0, S32, S32)
854     .clampScalar(1, S32, S64)
855     .scalarize(0)
856     .widenScalarToNextPow2(0, 32)
857     .widenScalarToNextPow2(1, 32);
858 
859   // The hardware instructions return a different result on 0 than the generic
860   // instructions expect. The hardware produces -1, but these produce the
861   // bitwidth.
862   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
863     .scalarize(0)
864     .clampScalar(0, S32, S32)
865     .clampScalar(1, S32, S64)
866     .widenScalarToNextPow2(0, 32)
867     .widenScalarToNextPow2(1, 32)
868     .lower();
869 
870   // The 64-bit versions produce 32-bit results, but only on the SALU.
871   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
872     .legalFor({{S32, S32}, {S32, S64}})
873     .clampScalar(0, S32, S32)
874     .clampScalar(1, S32, S64)
875     .scalarize(0)
876     .widenScalarToNextPow2(0, 32)
877     .widenScalarToNextPow2(1, 32);
878 
879   getActionDefinitionsBuilder(G_BITREVERSE)
880     .legalFor({S32})
881     .clampScalar(0, S32, S32)
882     .scalarize(0);
883 
884   if (ST.has16BitInsts()) {
885     getActionDefinitionsBuilder(G_BSWAP)
886       .legalFor({S16, S32, V2S16})
887       .clampMaxNumElements(0, S16, 2)
888       // FIXME: Fixing non-power-of-2 before clamp is workaround for
889       // narrowScalar limitation.
890       .widenScalarToNextPow2(0)
891       .clampScalar(0, S16, S32)
892       .scalarize(0);
893 
894     if (ST.hasVOP3PInsts()) {
895       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
896         .legalFor({S32, S16, V2S16})
897         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
898         .clampMaxNumElements(0, S16, 2)
899         .minScalar(0, S16)
900         .widenScalarToNextPow2(0)
901         .scalarize(0)
902         .lower();
903     } else {
904       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
905         .legalFor({S32, S16})
906         .widenScalarToNextPow2(0)
907         .minScalar(0, S16)
908         .scalarize(0)
909         .lower();
910     }
911   } else {
912     // TODO: Should have same legality without v_perm_b32
913     getActionDefinitionsBuilder(G_BSWAP)
914       .legalFor({S32})
915       .lowerIf(scalarNarrowerThan(0, 32))
916       // FIXME: Fixing non-power-of-2 before clamp is workaround for
917       // narrowScalar limitation.
918       .widenScalarToNextPow2(0)
919       .maxScalar(0, S32)
920       .scalarize(0)
921       .lower();
922 
923     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
924       .legalFor({S32})
925       .minScalar(0, S32)
926       .widenScalarToNextPow2(0)
927       .scalarize(0)
928       .lower();
929   }
930 
931   getActionDefinitionsBuilder(G_INTTOPTR)
932     // List the common cases
933     .legalForCartesianProduct(AddrSpaces64, {S64})
934     .legalForCartesianProduct(AddrSpaces32, {S32})
935     .scalarize(0)
936     // Accept any address space as long as the size matches
937     .legalIf(sameSize(0, 1))
938     .widenScalarIf(smallerThan(1, 0),
939       [](const LegalityQuery &Query) {
940         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
941       })
942     .narrowScalarIf(largerThan(1, 0),
943       [](const LegalityQuery &Query) {
944         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
945       });
946 
947   getActionDefinitionsBuilder(G_PTRTOINT)
948     // List the common cases
949     .legalForCartesianProduct(AddrSpaces64, {S64})
950     .legalForCartesianProduct(AddrSpaces32, {S32})
951     .scalarize(0)
952     // Accept any address space as long as the size matches
953     .legalIf(sameSize(0, 1))
954     .widenScalarIf(smallerThan(0, 1),
955       [](const LegalityQuery &Query) {
956         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
957       })
958     .narrowScalarIf(
959       largerThan(0, 1),
960       [](const LegalityQuery &Query) {
961         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
962       });
963 
964   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
965     .scalarize(0)
966     .custom();
967 
968   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
969                                     bool IsLoad) -> bool {
970     const LLT DstTy = Query.Types[0];
971 
972     // Split vector extloads.
973     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
974     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
975 
976     if (MemSize < DstTy.getSizeInBits())
977       MemSize = std::max(MemSize, AlignBits);
978 
979     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
980       return true;
981 
982     const LLT PtrTy = Query.Types[1];
983     unsigned AS = PtrTy.getAddressSpace();
984     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
985       return true;
986 
987     // Catch weird sized loads that don't evenly divide into the access sizes
988     // TODO: May be able to widen depending on alignment etc.
989     unsigned NumRegs = (MemSize + 31) / 32;
990     if (NumRegs == 3) {
991       if (!ST.hasDwordx3LoadStores())
992         return true;
993     } else {
994       // If the alignment allows, these should have been widened.
995       if (!isPowerOf2_32(NumRegs))
996         return true;
997     }
998 
999     if (AlignBits < MemSize) {
1000       const SITargetLowering *TLI = ST.getTargetLowering();
1001       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1002                                                       Align(AlignBits / 8));
1003     }
1004 
1005     return false;
1006   };
1007 
1008   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
1009                                          unsigned Opc) -> bool {
1010     unsigned Size = Query.Types[0].getSizeInBits();
1011     if (isPowerOf2_32(Size))
1012       return false;
1013 
1014     if (Size == 96 && ST.hasDwordx3LoadStores())
1015       return false;
1016 
1017     unsigned AddrSpace = Query.Types[1].getAddressSpace();
1018     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
1019       return false;
1020 
1021     unsigned Align = Query.MMODescrs[0].AlignInBits;
1022     unsigned RoundedSize = NextPowerOf2(Size);
1023     return (Align >= RoundedSize);
1024   };
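  // Example (shouldWidenLoadResult): a 96-bit load result is widened to 128
  // bits when the access is at least 128-bit aligned and the subtarget lacks
  // dwordx3 load/stores; with dwordx3 available it is left as-is.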
1025 
1026   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
1027   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
1028   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
1029 
1030   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1031   // LDS
1032   // TODO: Unsupported flat for SI.
1033 
1034   for (unsigned Op : {G_LOAD, G_STORE}) {
1035     const bool IsStore = Op == G_STORE;
1036 
1037     auto &Actions = getActionDefinitionsBuilder(Op);
1038     // Explicitly list some common cases.
1039     // TODO: Does this help compile time at all?
1040     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1041                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1042                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1043                                       {S64, GlobalPtr, 64, GlobalAlign32},
1044                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1045                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1046                                       {S32, GlobalPtr, 8, GlobalAlign8},
1047                                       {S32, GlobalPtr, 16, GlobalAlign16},
1048 
1049                                       {S32, LocalPtr, 32, 32},
1050                                       {S64, LocalPtr, 64, 32},
1051                                       {V2S32, LocalPtr, 64, 32},
1052                                       {S32, LocalPtr, 8, 8},
1053                                       {S32, LocalPtr, 16, 16},
1054                                       {V2S16, LocalPtr, 32, 32},
1055 
1056                                       {S32, PrivatePtr, 32, 32},
1057                                       {S32, PrivatePtr, 8, 8},
1058                                       {S32, PrivatePtr, 16, 16},
1059                                       {V2S16, PrivatePtr, 32, 32},
1060 
1061                                       {S32, ConstantPtr, 32, GlobalAlign32},
1062                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1063                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1064                                       {S64, ConstantPtr, 64, GlobalAlign32},
1065                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1066     Actions.legalIf(
1067       [=](const LegalityQuery &Query) -> bool {
1068         return isLoadStoreLegal(ST, Query, Op);
1069       });
1070 
1071     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1072     // 64-bits.
1073     //
1074     // TODO: Should generalize bitcast action into coerce, which will also cover
1075     // inserting addrspacecasts.
1076     Actions.customIf(typeIs(1, Constant32Ptr));
1077 
1078     // Turn any illegal element vectors into something easier to deal
1079     // with. These will ultimately produce 32-bit scalar shifts to extract the
1080     // parts anyway.
1081     //
1082     // For odd 16-bit element vectors, prefer to split those into pieces with
1083     // 16-bit vector parts.
1084     Actions.bitcastIf(
1085       [=](const LegalityQuery &Query) -> bool {
1086         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1087                                           Query.MMODescrs[0].SizeInBits);
1088       }, bitcastToRegisterType(0));
1089 
1090     Actions
1091         .customIf(typeIs(1, Constant32Ptr))
1092         // Widen suitably aligned loads by loading extra elements.
1093         .moreElementsIf([=](const LegalityQuery &Query) {
1094             const LLT Ty = Query.Types[0];
1095             return Op == G_LOAD && Ty.isVector() &&
1096                    shouldWidenLoadResult(Query, Op);
1097           }, moreElementsToNextPow2(0))
1098         .widenScalarIf([=](const LegalityQuery &Query) {
1099             const LLT Ty = Query.Types[0];
1100             return Op == G_LOAD && !Ty.isVector() &&
1101                    shouldWidenLoadResult(Query, Op);
1102           }, widenScalarOrEltToNextPow2(0))
1103         .narrowScalarIf(
1104             [=](const LegalityQuery &Query) -> bool {
1105               return !Query.Types[0].isVector() &&
1106                      needToSplitMemOp(Query, Op == G_LOAD);
1107             },
1108             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1109               const LLT DstTy = Query.Types[0];
1110               const LLT PtrTy = Query.Types[1];
1111 
1112               const unsigned DstSize = DstTy.getSizeInBits();
1113               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1114 
1115               // Split extloads.
1116               if (DstSize > MemSize)
1117                 return std::make_pair(0, LLT::scalar(MemSize));
1118 
1119               if (!isPowerOf2_32(DstSize)) {
1120                 // We're probably decomposing an odd sized store. Try to split
1121                 // to the widest type. TODO: Account for alignment. As-is it
1122                 // should be OK, since the new parts will be further legalized.
1123                 unsigned FloorSize = PowerOf2Floor(DstSize);
1124                 return std::make_pair(0, LLT::scalar(FloorSize));
1125               }
1126 
1127               if (DstSize > 32 && (DstSize % 32 != 0)) {
1128                 // FIXME: Need a way to specify non-extload of larger size if
1129                 // suitably aligned.
1130                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1131               }
1132 
1133               unsigned MaxSize = maxSizeForAddrSpace(ST,
1134                                                      PtrTy.getAddressSpace(),
1135                                                      Op == G_LOAD);
1136               if (MemSize > MaxSize)
1137                 return std::make_pair(0, LLT::scalar(MaxSize));
1138 
1139               unsigned Align = Query.MMODescrs[0].AlignInBits;
1140               return std::make_pair(0, LLT::scalar(Align));
1141             })
1142         .fewerElementsIf(
1143             [=](const LegalityQuery &Query) -> bool {
1144               return Query.Types[0].isVector() &&
1145                      needToSplitMemOp(Query, Op == G_LOAD);
1146             },
1147             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1148               const LLT DstTy = Query.Types[0];
1149               const LLT PtrTy = Query.Types[1];
1150 
1151               LLT EltTy = DstTy.getElementType();
1152               unsigned MaxSize = maxSizeForAddrSpace(ST,
1153                                                      PtrTy.getAddressSpace(),
1154                                                      Op == G_LOAD);
1155 
1156               // FIXME: Handle widened to power of 2 results better. This ends
1157               // up scalarizing.
1158               // FIXME: 3 element stores scalarized on SI
1159 
1160               // Split if it's too large for the address space.
1161               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1162                 unsigned NumElts = DstTy.getNumElements();
1163                 unsigned EltSize = EltTy.getSizeInBits();
1164 
1165                 if (MaxSize % EltSize == 0) {
1166                   return std::make_pair(
1167                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1168                 }
1169 
1170                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1171 
1172                 // FIXME: Refine when odd breakdowns handled
1173                 // The scalars will need to be re-legalized.
1174                 if (NumPieces == 1 || NumPieces >= NumElts ||
1175                     NumElts % NumPieces != 0)
1176                   return std::make_pair(0, EltTy);
1177 
1178                 return std::make_pair(0,
1179                                       LLT::vector(NumElts / NumPieces, EltTy));
1180               }
1181 
1182               // FIXME: We could probably handle weird extending loads better.
1183               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1184               if (DstTy.getSizeInBits() > MemSize)
1185                 return std::make_pair(0, EltTy);
1186 
1187               unsigned EltSize = EltTy.getSizeInBits();
1188               unsigned DstSize = DstTy.getSizeInBits();
1189               if (!isPowerOf2_32(DstSize)) {
1190                 // We're probably decomposing an odd sized store. Try to split
1191                 // to the widest type. TODO: Account for alignment. As-is it
1192                 // should be OK, since the new parts will be further legalized.
1193                 unsigned FloorSize = PowerOf2Floor(DstSize);
1194                 return std::make_pair(
1195                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1196               }
1197 
1198               // Need to split because of alignment.
1199               unsigned Align = Query.MMODescrs[0].AlignInBits;
1200               if (EltSize > Align &&
1201                   (EltSize / Align < DstTy.getNumElements())) {
1202                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1203               }
1204 
1205               // May need relegalization for the scalars.
1206               return std::make_pair(0, EltTy);
1207             })
1208         .minScalar(0, S32);
1209 
1210     if (IsStore)
1211       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1212 
1213     // TODO: Need a bitcast lower option?
1214     Actions
1215         .widenScalarToNextPow2(0)
1216         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1217   }
1218 
1219   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1220                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1221                                                   {S32, GlobalPtr, 16, 2 * 8},
1222                                                   {S32, LocalPtr, 8, 8},
1223                                                   {S32, LocalPtr, 16, 16},
1224                                                   {S32, PrivatePtr, 8, 8},
1225                                                   {S32, PrivatePtr, 16, 16},
1226                                                   {S32, ConstantPtr, 8, 8},
1227                                                   {S32, ConstantPtr, 16, 2 * 8}});
1228   if (ST.hasFlatAddressSpace()) {
1229     ExtLoads.legalForTypesWithMemDesc(
1230         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1231   }
1232 
1233   ExtLoads.clampScalar(0, S32, S32)
1234           .widenScalarToNextPow2(0)
1235           .unsupportedIfMemSizeNotPow2()
1236           .lower();
1237 
1238   auto &Atomics = getActionDefinitionsBuilder(
1239     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1240      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1241      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1242      G_ATOMICRMW_UMIN})
1243     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1244                {S64, GlobalPtr}, {S64, LocalPtr},
1245                {S32, RegionPtr}, {S64, RegionPtr}});
1246   if (ST.hasFlatAddressSpace()) {
1247     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1248   }
1249 
1250   if (ST.hasLDSFPAtomics()) {
1251     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1252       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1253   }
1254 
1255   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1256   // demarshalling
1257   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1258     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1259                 {S32, FlatPtr}, {S64, FlatPtr}})
1260     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1261                {S32, RegionPtr}, {S64, RegionPtr}});
1262   // TODO: Pointer types, any 32-bit or 64-bit vector
1263 
1264   // Condition should be s32 for scalar, s1 for vector.
1265   getActionDefinitionsBuilder(G_SELECT)
1266     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1267           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1268           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1269     .clampScalar(0, S16, S64)
1270     .scalarize(1)
1271     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1272     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1273     .clampMaxNumElements(0, S32, 2)
1274     .clampMaxNumElements(0, LocalPtr, 2)
1275     .clampMaxNumElements(0, PrivatePtr, 2)
1276     .scalarize(0)
1277     .widenScalarToNextPow2(0)
1278     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1279 
1280   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1281   // be more flexible with the shift amount type.
1282   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1283     .legalFor({{S32, S32}, {S64, S32}});
1284   if (ST.has16BitInsts()) {
1285     if (ST.hasVOP3PInsts()) {
1286       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1287             .clampMaxNumElements(0, S16, 2);
1288     } else
1289       Shifts.legalFor({{S16, S16}});
1290 
1291     // TODO: Support 16-bit shift amounts for all types
1292     Shifts.widenScalarIf(
1293       [=](const LegalityQuery &Query) {
1294         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1295         // 32-bit amount.
1296         const LLT ValTy = Query.Types[0];
1297         const LLT AmountTy = Query.Types[1];
1298         return ValTy.getSizeInBits() <= 16 &&
1299                AmountTy.getSizeInBits() < 16;
1300       }, changeTo(1, S16));
1301     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1302     Shifts.clampScalar(1, S32, S32);
1303     Shifts.clampScalar(0, S16, S64);
1304     Shifts.widenScalarToNextPow2(0, 16);
1305 
1306     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1307       .minScalar(0, S16)
1308       .scalarize(0)
1309       .lower();
1310   } else {
1311     // Make sure we legalize the shift amount type first, as the general
1312     // expansion for the shifted type will produce much worse code if it hasn't
1313     // been truncated already.
1314     Shifts.clampScalar(1, S32, S32);
1315     Shifts.clampScalar(0, S32, S64);
1316     Shifts.widenScalarToNextPow2(0, 32);
1317 
1318     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1319       .minScalar(0, S32)
1320       .scalarize(0)
1321       .lower();
1322   }
1323   Shifts.scalarize(0);
1324 
1325   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1326     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1327     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1328     unsigned IdxTypeIdx = 2;
1329 
1330     getActionDefinitionsBuilder(Op)
1331       .customIf([=](const LegalityQuery &Query) {
1332           const LLT EltTy = Query.Types[EltTypeIdx];
1333           const LLT VecTy = Query.Types[VecTypeIdx];
1334           const LLT IdxTy = Query.Types[IdxTypeIdx];
1335           const unsigned EltSize = EltTy.getSizeInBits();
1336           return (EltSize == 32 || EltSize == 64) &&
1337                   VecTy.getSizeInBits() % 32 == 0 &&
1338                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1339                   IdxTy.getSizeInBits() == 32;
1340         })
1341       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1342                  bitcastToVectorElement32(VecTypeIdx))
1343       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1344       .bitcastIf(
1345         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1346         [=](const LegalityQuery &Query) {
1347           // For > 64-bit element types, try to turn this into a 64-bit
1348           // element vector since we may be able to do better indexing
1349           // if this is scalar. If not, fall back to 32.
1350           const LLT EltTy = Query.Types[EltTypeIdx];
1351           const LLT VecTy = Query.Types[VecTypeIdx];
1352           const unsigned DstEltSize = EltTy.getSizeInBits();
1353           const unsigned VecSize = VecTy.getSizeInBits();
1354 
1355           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1356           return std::make_pair(
1357             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1358         })
1359       .clampScalar(EltTypeIdx, S32, S64)
1360       .clampScalar(VecTypeIdx, S32, S64)
1361       .clampScalar(IdxTypeIdx, S32, S32)
1362       .clampMaxNumElements(1, S32, 32)
1363       // TODO: Clamp elements for 64-bit vectors?
1364       // It should only be necessary with variable indexes.
1365       // As a last resort, lower to the stack
1366       .lower();
1367   }
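  // Example for the bitcast rules above: extracting an element from <8 x s8>
  // first retypes the vector as <2 x s32>, and a <2 x s128> source becomes
  // <4 x s64>, since 128-bit elements are a multiple of 64 bits.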
1368 
1369   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1370     .unsupportedIf([=](const LegalityQuery &Query) {
1371         const LLT &EltTy = Query.Types[1].getElementType();
1372         return Query.Types[0] != EltTy;
1373       });
1374 
1375   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1376     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1377     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1378 
1379     // FIXME: Doesn't handle extract of illegal sizes.
1380     getActionDefinitionsBuilder(Op)
1381       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1382       // FIXME: Multiples of 16 should not be legal.
1383       .legalIf([=](const LegalityQuery &Query) {
1384           const LLT BigTy = Query.Types[BigTyIdx];
1385           const LLT LitTy = Query.Types[LitTyIdx];
1386           return (BigTy.getSizeInBits() % 32 == 0) &&
1387                  (LitTy.getSizeInBits() % 16 == 0);
1388         })
1389       .widenScalarIf(
1390         [=](const LegalityQuery &Query) {
1391           const LLT BigTy = Query.Types[BigTyIdx];
1392           return (BigTy.getScalarSizeInBits() < 16);
1393         },
1394         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1395       .widenScalarIf(
1396         [=](const LegalityQuery &Query) {
1397           const LLT LitTy = Query.Types[LitTyIdx];
1398           return (LitTy.getScalarSizeInBits() < 16);
1399         },
1400         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1401       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1402       .widenScalarToNextPow2(BigTyIdx, 32);
1403 
1404   }
1405 
1406   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1407     .legalForCartesianProduct(AllS32Vectors, {S32})
1408     .legalForCartesianProduct(AllS64Vectors, {S64})
1409     .clampNumElements(0, V16S32, V32S32)
1410     .clampNumElements(0, V2S64, V16S64)
1411     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1412 
1413   if (ST.hasScalarPackInsts()) {
1414     BuildVector
1415       // FIXME: Should probably widen s1 vectors straight to s32
1416       .minScalarOrElt(0, S16)
1417       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1418       .minScalar(1, S32);
1419 
1420     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1421       .legalFor({V2S16, S32})
1422       .lower();
1423     BuildVector.minScalarOrElt(0, S32);
1424   } else {
1425     BuildVector.customFor({V2S16, S16});
1426     BuildVector.minScalarOrElt(0, S32);
1427 
1428     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1429       .customFor({V2S16, S32})
1430       .lower();
1431   }
1432 
1433   BuildVector.legalIf(isRegisterType(0));
1434 
1435   // FIXME: Clamp maximum size
1436   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1437     .legalIf(isRegisterType(0));
1438 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1440   // pre-legalize.
1441   if (ST.hasVOP3PInsts()) {
1442     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1443       .customFor({V2S16, V2S16})
1444       .lower();
1445   } else
1446     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1447 
1448   // Merge/Unmerge
1449   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1450     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1451     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1452 
1453     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1454       const LLT Ty = Query.Types[TypeIdx];
1455       if (Ty.isVector()) {
1456         const LLT &EltTy = Ty.getElementType();
1457         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1458           return true;
1459         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1460           return true;
1461       }
1462       return false;
1463     };
1464 
1465     auto &Builder = getActionDefinitionsBuilder(Op)
1466       .lowerFor({{S16, V2S16}})
1467       .lowerIf([=](const LegalityQuery &Query) {
1468           const LLT BigTy = Query.Types[BigTyIdx];
1469           return BigTy.getSizeInBits() == 32;
1470         })
1471       // Try to widen to s16 first for small types.
1472       // TODO: Only do this on targets with legal s16 shifts
1473       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1474       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1475       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1476       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1477                            elementTypeIs(1, S16)),
1478                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1480       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1481       // valid.
1482       .clampScalar(LitTyIdx, S32, S512)
1483       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1484       // Break up vectors with weird elements into scalars
1485       .fewerElementsIf(
1486         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1487         scalarize(0))
1488       .fewerElementsIf(
1489         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1490         scalarize(1))
1491       .clampScalar(BigTyIdx, S32, MaxScalar);
1492 
1493     if (Op == G_MERGE_VALUES) {
1494       Builder.widenScalarIf(
1495         // TODO: Use 16-bit shifts if legal for 8-bit values?
1496         [=](const LegalityQuery &Query) {
1497           const LLT Ty = Query.Types[LitTyIdx];
1498           return Ty.getSizeInBits() < 32;
1499         },
1500         changeTo(LitTyIdx, S32));
1501     }
1502 
1503     Builder.widenScalarIf(
1504       [=](const LegalityQuery &Query) {
1505         const LLT Ty = Query.Types[BigTyIdx];
1506         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1507           Ty.getSizeInBits() % 16 != 0;
1508       },
1509       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 for sizes over 128,
        // whichever is smaller.
1512         const LLT &Ty = Query.Types[BigTyIdx];
1513         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1514         if (NewSizeInBits >= 256) {
1515           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1516           if (RoundedTo < NewSizeInBits)
1517             NewSizeInBits = RoundedTo;
1518         }
1519         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1520       })
1521       .legalIf([=](const LegalityQuery &Query) {
1522           const LLT &BigTy = Query.Types[BigTyIdx];
1523           const LLT &LitTy = Query.Types[LitTyIdx];
1524 
1525           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1526             return false;
1527           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1528             return false;
1529 
1530           return BigTy.getSizeInBits() % 16 == 0 &&
1531                  LitTy.getSizeInBits() % 16 == 0 &&
1532                  BigTy.getSizeInBits() <= MaxRegisterSize;
1533         })
1534       // Any vectors left are the wrong size. Scalarize them.
1535       .scalarize(0)
1536       .scalarize(1);
1537   }
1538 
1539   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1540   // RegBankSelect.
1541   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1542     .legalFor({{S32}, {S64}});
1543 
1544   if (ST.hasVOP3PInsts()) {
1545     SextInReg.lowerFor({{V2S16}})
1546       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1547       // get more vector shift opportunities, since we'll get those when
1548       // expanded.
1549       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1550   } else if (ST.has16BitInsts()) {
1551     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1552   } else {
1553     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1555     SextInReg.lowerFor({{S32}, {S64}});
1556   }
1557 
1558   SextInReg
1559     .scalarize(0)
1560     .clampScalar(0, S32, S64)
1561     .lower();
1562 
1563   getActionDefinitionsBuilder(G_FSHR)
1564     .legalFor({{S32, S32}})
1565     .scalarize(0)
1566     .lower();
1567 
1568   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1569     .legalFor({S64});
1570 
1571   getActionDefinitionsBuilder(G_FENCE)
1572     .alwaysLegal();
1573 
1574   getActionDefinitionsBuilder({
1575       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1576       G_FCOPYSIGN,
1577 
1578       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1579       G_ATOMICRMW_NAND,
1580       G_ATOMICRMW_FSUB,
1581       G_READ_REGISTER,
1582       G_WRITE_REGISTER,
1583 
1584       G_SADDO, G_SSUBO,
1585 
      // TODO: Implement
1587       G_FMINIMUM, G_FMAXIMUM,
1588       G_FSHL
1589     }).lower();
1590 
1591   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1592         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1593         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1594     .unsupported();
1595 
1596   computeTables();
1597   verify(*ST.getInstrInfo());
1598 }
1599 
1600 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1601                                          MachineInstr &MI) const {
1602   MachineIRBuilder &B = Helper.MIRBuilder;
1603   MachineRegisterInfo &MRI = *B.getMRI();
1604 
1605   switch (MI.getOpcode()) {
1606   case TargetOpcode::G_ADDRSPACE_CAST:
1607     return legalizeAddrSpaceCast(MI, MRI, B);
1608   case TargetOpcode::G_FRINT:
1609     return legalizeFrint(MI, MRI, B);
1610   case TargetOpcode::G_FCEIL:
1611     return legalizeFceil(MI, MRI, B);
1612   case TargetOpcode::G_FREM:
1613     return legalizeFrem(MI, MRI, B);
1614   case TargetOpcode::G_INTRINSIC_TRUNC:
1615     return legalizeIntrinsicTrunc(MI, MRI, B);
1616   case TargetOpcode::G_SITOFP:
1617     return legalizeITOFP(MI, MRI, B, true);
1618   case TargetOpcode::G_UITOFP:
1619     return legalizeITOFP(MI, MRI, B, false);
1620   case TargetOpcode::G_FPTOSI:
1621     return legalizeFPTOI(MI, MRI, B, true);
1622   case TargetOpcode::G_FPTOUI:
1623     return legalizeFPTOI(MI, MRI, B, false);
1624   case TargetOpcode::G_FMINNUM:
1625   case TargetOpcode::G_FMAXNUM:
1626   case TargetOpcode::G_FMINNUM_IEEE:
1627   case TargetOpcode::G_FMAXNUM_IEEE:
1628     return legalizeMinNumMaxNum(Helper, MI);
1629   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1630     return legalizeExtractVectorElt(MI, MRI, B);
1631   case TargetOpcode::G_INSERT_VECTOR_ELT:
1632     return legalizeInsertVectorElt(MI, MRI, B);
1633   case TargetOpcode::G_SHUFFLE_VECTOR:
1634     return legalizeShuffleVector(MI, MRI, B);
1635   case TargetOpcode::G_FSIN:
1636   case TargetOpcode::G_FCOS:
1637     return legalizeSinCos(MI, MRI, B);
1638   case TargetOpcode::G_GLOBAL_VALUE:
1639     return legalizeGlobalValue(MI, MRI, B);
1640   case TargetOpcode::G_LOAD:
1641     return legalizeLoad(Helper, MI);
1642   case TargetOpcode::G_FMAD:
1643     return legalizeFMad(MI, MRI, B);
1644   case TargetOpcode::G_FDIV:
1645     return legalizeFDIV(MI, MRI, B);
1646   case TargetOpcode::G_UDIV:
1647   case TargetOpcode::G_UREM:
1648     return legalizeUDIV_UREM(MI, MRI, B);
1649   case TargetOpcode::G_SDIV:
1650   case TargetOpcode::G_SREM:
1651     return legalizeSDIV_SREM(MI, MRI, B);
1652   case TargetOpcode::G_ATOMIC_CMPXCHG:
1653     return legalizeAtomicCmpXChg(MI, MRI, B);
1654   case TargetOpcode::G_FLOG:
1655     return legalizeFlog(MI, B, numbers::ln2f);
1656   case TargetOpcode::G_FLOG10:
1657     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1658   case TargetOpcode::G_FEXP:
1659     return legalizeFExp(MI, B);
1660   case TargetOpcode::G_FPOW:
1661     return legalizeFPow(MI, B);
1662   case TargetOpcode::G_FFLOOR:
1663     return legalizeFFloor(MI, MRI, B);
1664   case TargetOpcode::G_BUILD_VECTOR:
1665     return legalizeBuildVector(MI, MRI, B);
1666   default:
1667     return false;
1668   }
1669 
1670   llvm_unreachable("expected switch to return");
1671 }
1672 
1673 Register AMDGPULegalizerInfo::getSegmentAperture(
1674   unsigned AS,
1675   MachineRegisterInfo &MRI,
1676   MachineIRBuilder &B) const {
1677   MachineFunction &MF = B.getMF();
1678   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1679   const LLT S32 = LLT::scalar(32);
1680 
1681   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1682 
1683   if (ST.hasApertureRegs()) {
1684     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1685     // getreg.
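    // The aperture base is read out of a 16-bit field of the MEM_BASES
    // hardware register; shifting the field left by its width (WidthM1 + 1,
    // i.e. 16) below reconstructs the 32-bit aperture value that forms the
    // high half of a 64-bit flat address.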
1686     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1687         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1688         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1689     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1690         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1691         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1692     unsigned Encoding =
1693         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1694         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1695         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1696 
1697     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1698 
1699     B.buildInstr(AMDGPU::S_GETREG_B32)
1700       .addDef(GetReg)
1701       .addImm(Encoding);
1702     MRI.setType(GetReg, S32);
1703 
1704     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1705     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1706   }
1707 
1708   Register QueuePtr = MRI.createGenericVirtualRegister(
1709     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1710 
1711   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1712     return Register();
1713 
1714   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1715   // private_segment_aperture_base_hi.
1716   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1717 
1718   // TODO: can we be smarter about machine pointer info?
1719   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1720   MachineMemOperand *MMO = MF.getMachineMemOperand(
1721       PtrInfo,
1722       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1723           MachineMemOperand::MOInvariant,
1724       4, commonAlignment(Align(64), StructOffset));
1725 
1726   Register LoadAddr;
1727 
1728   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1729   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1730 }
1731 
1732 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1733   MachineInstr &MI, MachineRegisterInfo &MRI,
1734   MachineIRBuilder &B) const {
1735   MachineFunction &MF = B.getMF();
1736 
1737   const LLT S32 = LLT::scalar(32);
1738   Register Dst = MI.getOperand(0).getReg();
1739   Register Src = MI.getOperand(1).getReg();
1740 
1741   LLT DstTy = MRI.getType(Dst);
1742   LLT SrcTy = MRI.getType(Src);
1743   unsigned DestAS = DstTy.getAddressSpace();
1744   unsigned SrcAS = SrcTy.getAddressSpace();
1745 
1746   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1747   // vector element.
1748   assert(!DstTy.isVector());
1749 
1750   const AMDGPUTargetMachine &TM
1751     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1752 
1753   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1754     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1755     return true;
1756   }
1757 
1758   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1759     // Truncate.
1760     B.buildExtract(Dst, Src, 0);
1761     MI.eraseFromParent();
1762     return true;
1763   }
1764 
1765   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1766     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1767     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1768 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1772     auto HighAddr = B.buildConstant(
1773       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1774     B.buildMerge(Dst, {Src, HighAddr});
1775     MI.eraseFromParent();
1776     return true;
1777   }
1778 
1779   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1780     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1781            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1782     unsigned NullVal = TM.getNullPointerValue(DestAS);
1783 
1784     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1785     auto FlatNull = B.buildConstant(SrcTy, 0);
1786 
1787     // Extract low 32-bits of the pointer.
1788     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1789 
1790     auto CmpRes =
1791         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1792     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1793 
1794     MI.eraseFromParent();
1795     return true;
1796   }
1797 
1798   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1799     return false;
1800 
1801   if (!ST.hasFlatAddressSpace())
1802     return false;
1803 
1804   auto SegmentNull =
1805       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1806   auto FlatNull =
1807       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1808 
1809   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1810   if (!ApertureReg.isValid())
1811     return false;
1812 
1813   auto CmpRes =
1814       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1815 
1816   // Coerce the type of the low half of the result so we can use merge_values.
1817   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1818 
1819   // TODO: Should we allow mismatched types but matching sizes in merges to
1820   // avoid the ptrtoint?
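  // A non-null segment pointer becomes a flat pointer with the 32-bit segment
  // offset in the low half and the segment aperture base in the high half;
  // the segment null value maps to the flat null value.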
1821   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1822   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1823 
1824   MI.eraseFromParent();
1825   return true;
1826 }
1827 
1828 bool AMDGPULegalizerInfo::legalizeFrint(
1829   MachineInstr &MI, MachineRegisterInfo &MRI,
1830   MachineIRBuilder &B) const {
1831   Register Src = MI.getOperand(1).getReg();
1832   LLT Ty = MRI.getType(Src);
1833   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
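  // Round to nearest via the usual double-precision trick: adding and then
  // subtracting 2^52 (copysigned from the input) rounds the value to an
  // integer in the current rounding mode; inputs with
  // |x| > 0x1.fffffffffffffp+51 are already integral and are passed through
  // unchanged.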
1834 
1835   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1836   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1837 
1838   auto C1 = B.buildFConstant(Ty, C1Val);
1839   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1840 
1841   // TODO: Should this propagate fast-math-flags?
1842   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1843   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1844 
1845   auto C2 = B.buildFConstant(Ty, C2Val);
1846   auto Fabs = B.buildFAbs(Ty, Src);
1847 
1848   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1849   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1850   MI.eraseFromParent();
1851   return true;
1852 }
1853 
1854 bool AMDGPULegalizerInfo::legalizeFceil(
1855   MachineInstr &MI, MachineRegisterInfo &MRI,
1856   MachineIRBuilder &B) const {
1857 
1858   const LLT S1 = LLT::scalar(1);
1859   const LLT S64 = LLT::scalar(64);
1860 
1861   Register Src = MI.getOperand(1).getReg();
1862   assert(MRI.getType(Src) == S64);
1863 
1864   // result = trunc(src)
1865   // if (src > 0.0 && src != result)
1866   //   result += 1.0
1867 
1868   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1869 
1870   const auto Zero = B.buildFConstant(S64, 0.0);
1871   const auto One = B.buildFConstant(S64, 1.0);
1872   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1873   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1874   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1875   auto Add = B.buildSelect(S64, And, One, Zero);
1876 
1877   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1880 }
1881 
1882 bool AMDGPULegalizerInfo::legalizeFrem(
1883   MachineInstr &MI, MachineRegisterInfo &MRI,
1884   MachineIRBuilder &B) const {
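  // frem(x, y) is expanded as x - trunc(x / y) * y, with the multiply and
  // subtract fused into an FMA: fma(-trunc(x / y), y, x).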
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
1897 }
1898 
1899 static MachineInstrBuilder extractF64Exponent(Register Hi,
1900                                               MachineIRBuilder &B) {
1901   const unsigned FractBits = 52;
1902   const unsigned ExpBits = 11;
1903   LLT S32 = LLT::scalar(32);
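  // The 11 exponent bits sit at bit 20 of the high dword (bit 52 of the full
  // f64); extract them with ubfe and subtract the exponent bias of 1023.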
1904 
1905   auto Const0 = B.buildConstant(S32, FractBits - 32);
1906   auto Const1 = B.buildConstant(S32, ExpBits);
1907 
1908   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1909     .addUse(Hi)
1910     .addUse(Const0.getReg(0))
1911     .addUse(Const1.getReg(0));
1912 
1913   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1914 }
1915 
1916 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1917   MachineInstr &MI, MachineRegisterInfo &MRI,
1918   MachineIRBuilder &B) const {
1919   const LLT S1 = LLT::scalar(1);
1920   const LLT S32 = LLT::scalar(32);
1921   const LLT S64 = LLT::scalar(64);
1922 
1923   Register Src = MI.getOperand(1).getReg();
1924   assert(MRI.getType(Src) == S64);
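  // trunc for f64 clears the mantissa bits below the binary point: with the
  // unbiased exponent extracted, exp < 0 leaves only the sign (+/-0.0),
  // exp > 51 means the value is already an integer, and otherwise the low
  // (52 - exp) fraction bits are masked off.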
1925 
1926   // TODO: Should this use extract since the low half is unused?
1927   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1928   Register Hi = Unmerge.getReg(1);
1929 
1930   // Extract the upper half, since this is where we will find the sign and
1931   // exponent.
1932   auto Exp = extractF64Exponent(Hi, B);
1933 
1934   const unsigned FractBits = 52;
1935 
1936   // Extract the sign bit.
1937   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1938   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1939 
1940   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1941 
1942   const auto Zero32 = B.buildConstant(S32, 0);
1943 
1944   // Extend back to 64-bits.
1945   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1946 
1947   auto Shr = B.buildAShr(S64, FractMask, Exp);
1948   auto Not = B.buildNot(S64, Shr);
1949   auto Tmp0 = B.buildAnd(S64, Src, Not);
1950   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1951 
1952   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1953   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1954 
1955   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1956   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1957   MI.eraseFromParent();
1958   return true;
1959 }
1960 
1961 bool AMDGPULegalizerInfo::legalizeITOFP(
1962   MachineInstr &MI, MachineRegisterInfo &MRI,
1963   MachineIRBuilder &B, bool Signed) const {
1964 
1965   Register Dst = MI.getOperand(0).getReg();
1966   Register Src = MI.getOperand(1).getReg();
1967 
1968   const LLT S64 = LLT::scalar(64);
1969   const LLT S32 = LLT::scalar(32);
1970 
1971   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
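  // Convert the two 32-bit halves separately and recombine:
  //   result = ldexp((s|u)itofp(hi), 32) + uitofp(lo)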
1972 
1973   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1974 
1975   auto CvtHi = Signed ?
1976     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1977     B.buildUITOFP(S64, Unmerge.getReg(1));
1978 
1979   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1980 
1981   auto ThirtyTwo = B.buildConstant(S32, 32);
1982   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1983     .addUse(CvtHi.getReg(0))
1984     .addUse(ThirtyTwo.getReg(0));
1985 
1986   // TODO: Should this propagate fast-math-flags?
1987   B.buildFAdd(Dst, LdExp, CvtLo);
1988   MI.eraseFromParent();
1989   return true;
1990 }
1991 
1992 // TODO: Copied from DAG implementation. Verify logic and document how this
1993 // actually works.
1994 bool AMDGPULegalizerInfo::legalizeFPTOI(
1995   MachineInstr &MI, MachineRegisterInfo &MRI,
1996   MachineIRBuilder &B, bool Signed) const {
1997 
1998   Register Dst = MI.getOperand(0).getReg();
1999   Register Src = MI.getOperand(1).getReg();
2000 
2001   const LLT S64 = LLT::scalar(64);
2002   const LLT S32 = LLT::scalar(32);
2003 
2004   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2005 
2006   unsigned Flags = MI.getFlags();
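  // Split the result into two 32-bit halves:
  //   hi = fptoi(floor(trunc(x) * 2^-32))
  //   lo = fptoui(fma(floor(trunc(x) * 2^-32), -(2^32), trunc(x)))
  // K0 below is 2^-32 and K1 is -(2^32), given as f64 bit patterns.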
2007 
2008   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2009   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2010   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2011 
2012   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2013   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2014   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2015 
2016   auto Hi = Signed ?
2017     B.buildFPTOSI(S32, FloorMul) :
2018     B.buildFPTOUI(S32, FloorMul);
2019   auto Lo = B.buildFPTOUI(S32, Fma);
2020 
2021   B.buildMerge(Dst, { Lo, Hi });
2022   MI.eraseFromParent();
2023 
2024   return true;
2025 }
2026 
2027 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2028                                                MachineInstr &MI) const {
2029   MachineFunction &MF = Helper.MIRBuilder.getMF();
2030   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2031 
2032   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2033                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2034 
2035   // With ieee_mode disabled, the instructions have the correct behavior
2036   // already for G_FMINNUM/G_FMAXNUM
2037   if (!MFI->getMode().IEEE)
2038     return !IsIEEEOp;
2039 
2040   if (IsIEEEOp)
2041     return true;
2042 
2043   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2044 }
2045 
2046 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2047   MachineInstr &MI, MachineRegisterInfo &MRI,
2048   MachineIRBuilder &B) const {
2049   // TODO: Should move some of this into LegalizerHelper.
2050 
2051   // TODO: Promote dynamic indexing of s16 to s32
2052 
2053   // FIXME: Artifact combiner probably should have replaced the truncated
2054   // constant before this, so we shouldn't need
2055   // getConstantVRegValWithLookThrough.
2056   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2057     MI.getOperand(2).getReg(), MRI);
2058   if (!IdxVal) // Dynamic case will be selected to register indexing.
2059     return true;
2060 
2061   Register Dst = MI.getOperand(0).getReg();
2062   Register Vec = MI.getOperand(1).getReg();
2063 
2064   LLT VecTy = MRI.getType(Vec);
2065   LLT EltTy = VecTy.getElementType();
2066   assert(EltTy == MRI.getType(Dst));
2067 
2068   if (IdxVal->Value < VecTy.getNumElements())
2069     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2070   else
2071     B.buildUndef(Dst);
2072 
2073   MI.eraseFromParent();
2074   return true;
2075 }
2076 
2077 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2078   MachineInstr &MI, MachineRegisterInfo &MRI,
2079   MachineIRBuilder &B) const {
2080   // TODO: Should move some of this into LegalizerHelper.
2081 
2082   // TODO: Promote dynamic indexing of s16 to s32
2083 
2084   // FIXME: Artifact combiner probably should have replaced the truncated
2085   // constant before this, so we shouldn't need
2086   // getConstantVRegValWithLookThrough.
2087   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2088     MI.getOperand(3).getReg(), MRI);
2089   if (!IdxVal) // Dynamic case will be selected to register indexing.
2090     return true;
2091 
2092   Register Dst = MI.getOperand(0).getReg();
2093   Register Vec = MI.getOperand(1).getReg();
2094   Register Ins = MI.getOperand(2).getReg();
2095 
2096   LLT VecTy = MRI.getType(Vec);
2097   LLT EltTy = VecTy.getElementType();
2098   assert(EltTy == MRI.getType(Ins));
2099 
2100   if (IdxVal->Value < VecTy.getNumElements())
2101     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2102   else
2103     B.buildUndef(Dst);
2104 
2105   MI.eraseFromParent();
2106   return true;
2107 }
2108 
2109 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2110   MachineInstr &MI, MachineRegisterInfo &MRI,
2111   MachineIRBuilder &B) const {
2112   const LLT V2S16 = LLT::vector(2, 16);
2113 
2114   Register Dst = MI.getOperand(0).getReg();
2115   Register Src0 = MI.getOperand(1).getReg();
2116   LLT DstTy = MRI.getType(Dst);
2117   LLT SrcTy = MRI.getType(Src0);
2118 
2119   if (SrcTy == V2S16 && DstTy == V2S16 &&
2120       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2121     return true;
2122 
2123   MachineIRBuilder HelperBuilder(MI);
2124   GISelObserverWrapper DummyObserver;
2125   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2126   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2127 }
2128 
2129 bool AMDGPULegalizerInfo::legalizeSinCos(
2130   MachineInstr &MI, MachineRegisterInfo &MRI,
2131   MachineIRBuilder &B) const {
2132 
2133   Register DstReg = MI.getOperand(0).getReg();
2134   Register SrcReg = MI.getOperand(1).getReg();
2135   LLT Ty = MRI.getType(DstReg);
2136   unsigned Flags = MI.getFlags();
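  // The hardware sin/cos take their input scaled by 1/(2*pi), so multiply the
  // source by 0.5 * inv_pi first; subtargets with a reduced trig input range
  // additionally need the scaled value wrapped into [0, 1) via amdgcn.fract.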
2137 
2138   Register TrigVal;
2139   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2140   if (ST.hasTrigReducedRange()) {
2141     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2142     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2143       .addUse(MulVal.getReg(0))
2144       .setMIFlags(Flags).getReg(0);
2145   } else
2146     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2147 
2148   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2149     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2150   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2151     .addUse(TrigVal)
2152     .setMIFlags(Flags);
2153   MI.eraseFromParent();
2154   return true;
2155 }
2156 
2157 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2158                                                   MachineIRBuilder &B,
2159                                                   const GlobalValue *GV,
2160                                                   int64_t Offset,
2161                                                   unsigned GAFlags) const {
2162   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2163   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2164   // to the following code sequence:
2165   //
2166   // For constant address space:
2167   //   s_getpc_b64 s[0:1]
2168   //   s_add_u32 s0, s0, $symbol
2169   //   s_addc_u32 s1, s1, 0
2170   //
2171   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2172   //   a fixup or relocation is emitted to replace $symbol with a literal
2173   //   constant, which is a pc-relative offset from the encoding of the $symbol
2174   //   operand to the global variable.
2175   //
2176   // For global address space:
2177   //   s_getpc_b64 s[0:1]
2178   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2179   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2180   //
2181   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2182   //   fixups or relocations are emitted to replace $symbol@*@lo and
2183   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2184   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2185   //   operand to the global variable.
2186   //
2187   // What we want here is an offset from the value returned by s_getpc
2188   // (which is the address of the s_add_u32 instruction) to the global
2189   // variable, but since the encoding of $symbol starts 4 bytes after the start
2190   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2191   // small. This requires us to add 4 to the global variable offset in order to
2192   // compute the correct address.
2193 
2194   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2195 
2196   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2197     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2198 
2199   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2200     .addDef(PCReg);
2201 
2202   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2203   if (GAFlags == SIInstrInfo::MO_NONE)
2204     MIB.addImm(0);
2205   else
2206     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2207 
2208   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2209 
2210   if (PtrTy.getSizeInBits() == 32)
2211     B.buildExtract(DstReg, PCReg, 0);
2212   return true;
2213  }
2214 
2215 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2216   MachineInstr &MI, MachineRegisterInfo &MRI,
2217   MachineIRBuilder &B) const {
2218   Register DstReg = MI.getOperand(0).getReg();
2219   LLT Ty = MRI.getType(DstReg);
2220   unsigned AS = Ty.getAddressSpace();
2221 
2222   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2223   MachineFunction &MF = B.getMF();
2224   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2225 
2226   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2227     if (!MFI->isEntryFunction()) {
2228       const Function &Fn = MF.getFunction();
2229       DiagnosticInfoUnsupported BadLDSDecl(
2230         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2231         DS_Warning);
2232       Fn.getContext().diagnose(BadLDSDecl);
2233 
2234       // We currently don't have a way to correctly allocate LDS objects that
2235       // aren't directly associated with a kernel. We do force inlining of
2236       // functions that use local objects. However, if these dead functions are
2237       // not eliminated, we don't want a compile time error. Just emit a warning
2238       // and a trap, since there should be no callable path here.
2239       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2240       B.buildUndef(DstReg);
2241       MI.eraseFromParent();
2242       return true;
2243     }
2244 
2245     // TODO: We could emit code to handle the initialization somewhere.
2246     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2247       const SITargetLowering *TLI = ST.getTargetLowering();
2248       if (!TLI->shouldUseLDSConstAddress(GV)) {
2249         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2251       }
2252 
2253       B.buildConstant(
2254           DstReg,
2255           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2256       MI.eraseFromParent();
2257       return true;
2258     }
2259 
2260     const Function &Fn = MF.getFunction();
2261     DiagnosticInfoUnsupported BadInit(
2262       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2263     Fn.getContext().diagnose(BadInit);
2264     return true;
2265   }
2266 
2267   const SITargetLowering *TLI = ST.getTargetLowering();
2268 
2269   if (TLI->shouldEmitFixup(GV)) {
2270     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2271     MI.eraseFromParent();
2272     return true;
2273   }
2274 
2275   if (TLI->shouldEmitPCReloc(GV)) {
2276     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2277     MI.eraseFromParent();
2278     return true;
2279   }
2280 
2281   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2282   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2283 
2284   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2285       MachinePointerInfo::getGOT(MF),
2286       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2287           MachineMemOperand::MOInvariant,
2288       8 /*Size*/, Align(8));
2289 
2290   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2291 
2292   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2294     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2295     B.buildExtract(DstReg, Load, 0);
2296   } else
2297     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2298 
2299   MI.eraseFromParent();
2300   return true;
2301 }
2302 
2303 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2304                                        MachineInstr &MI) const {
2305   MachineIRBuilder &B = Helper.MIRBuilder;
2306   MachineRegisterInfo &MRI = *B.getMRI();
2307   GISelChangeObserver &Observer = Helper.Observer;
2308 
2309   Register PtrReg = MI.getOperand(1).getReg();
2310   LLT PtrTy = MRI.getType(PtrReg);
2311   unsigned AddrSpace = PtrTy.getAddressSpace();
2312 
2313   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2314     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2315     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2316     Observer.changingInstr(MI);
2317     MI.getOperand(1).setReg(Cast.getReg(0));
2318     Observer.changedInstr(MI);
2319     return true;
2320   }
2321 
2322   return false;
2323 }
2324 
2325 bool AMDGPULegalizerInfo::legalizeFMad(
2326   MachineInstr &MI, MachineRegisterInfo &MRI,
2327   MachineIRBuilder &B) const {
2328   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2329   assert(Ty.isScalar());
2330 
2331   MachineFunction &MF = B.getMF();
2332   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2333 
2334   // TODO: Always legal with future ftz flag.
2335   // FIXME: Do we need just output?
2336   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2337     return true;
2338   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2339     return true;
2340 
2341   MachineIRBuilder HelperBuilder(MI);
2342   GISelObserverWrapper DummyObserver;
2343   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2344   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2345 }
2346 
2347 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2348   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2349   Register DstReg = MI.getOperand(0).getReg();
2350   Register PtrReg = MI.getOperand(1).getReg();
2351   Register CmpVal = MI.getOperand(2).getReg();
2352   Register NewVal = MI.getOperand(3).getReg();
2353 
2354   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2355          "this should not have been custom lowered");
2356 
2357   LLT ValTy = MRI.getType(CmpVal);
2358   LLT VecTy = LLT::vector(2, ValTy);
2359 
2360   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2361 
2362   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2363     .addDef(DstReg)
2364     .addUse(PtrReg)
2365     .addUse(PackedVal)
2366     .setMemRefs(MI.memoperands());
2367 
2368   MI.eraseFromParent();
2369   return true;
2370 }
2371 
2372 bool AMDGPULegalizerInfo::legalizeFlog(
2373   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2374   Register Dst = MI.getOperand(0).getReg();
2375   Register Src = MI.getOperand(1).getReg();
2376   LLT Ty = B.getMRI()->getType(Dst);
2377   unsigned Flags = MI.getFlags();
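  // log_b(x) == log2(x) * (1 / log2(b)) == log2(x) * (ln(2) / ln(b)); the
  // callers pass ln(2) for log and ln(2)/ln(10) for log10 as
  // Log2BaseInverted.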
2378 
2379   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2380   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2381 
2382   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2383   MI.eraseFromParent();
2384   return true;
2385 }
2386 
2387 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2388                                        MachineIRBuilder &B) const {
2389   Register Dst = MI.getOperand(0).getReg();
2390   Register Src = MI.getOperand(1).getReg();
2391   unsigned Flags = MI.getFlags();
2392   LLT Ty = B.getMRI()->getType(Dst);
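  // exp(x) == exp2(x * log2(e)).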
2393 
2394   auto K = B.buildFConstant(Ty, numbers::log2e);
2395   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2396   B.buildFExp2(Dst, Mul, Flags);
2397   MI.eraseFromParent();
2398   return true;
2399 }
2400 
2401 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2402                                        MachineIRBuilder &B) const {
2403   Register Dst = MI.getOperand(0).getReg();
2404   Register Src0 = MI.getOperand(1).getReg();
2405   Register Src1 = MI.getOperand(2).getReg();
2406   unsigned Flags = MI.getFlags();
2407   LLT Ty = B.getMRI()->getType(Dst);
2408   const LLT S16 = LLT::scalar(16);
2409   const LLT S32 = LLT::scalar(32);
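  // pow(x, y) is expanded as exp2(y * log2(x)), with the multiply done by the
  // fmul_legacy intrinsic (which treats 0.0 * x as 0.0).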
2410 
2411   if (Ty == S32) {
2412     auto Log = B.buildFLog2(S32, Src0, Flags);
2413     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2414       .addUse(Log.getReg(0))
2415       .addUse(Src1)
2416       .setMIFlags(Flags);
2417     B.buildFExp2(Dst, Mul, Flags);
2418   } else if (Ty == S16) {
2419     // There's no f16 fmul_legacy, so we need to convert for it.
2420     auto Log = B.buildFLog2(S16, Src0, Flags);
2421     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2422     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2423     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2424       .addUse(Ext0.getReg(0))
2425       .addUse(Ext1.getReg(0))
2426       .setMIFlags(Flags);
2427 
2428     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2429   } else
2430     return false;
2431 
2432   MI.eraseFromParent();
2433   return true;
2434 }
2435 
2436 // Find a source register, ignoring any possible source modifiers.
2437 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2438   Register ModSrc = OrigSrc;
2439   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2440     ModSrc = SrcFNeg->getOperand(1).getReg();
2441     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2442       ModSrc = SrcFAbs->getOperand(1).getReg();
2443   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2444     ModSrc = SrcFAbs->getOperand(1).getReg();
2445   return ModSrc;
2446 }
2447 
2448 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2449                                          MachineRegisterInfo &MRI,
2450                                          MachineIRBuilder &B) const {
2451 
2452   const LLT S1 = LLT::scalar(1);
2453   const LLT S64 = LLT::scalar(64);
2454   Register Dst = MI.getOperand(0).getReg();
2455   Register OrigSrc = MI.getOperand(1).getReg();
2456   unsigned Flags = MI.getFlags();
2457   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2458          "this should not have been custom lowered");
2459 
2460   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2461   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2462   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2463   // V_FRACT bug is:
2464   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2465   //
2466   // Convert floor(x) to (x - fract(x))
2467 
2468   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2469     .addUse(OrigSrc)
2470     .setMIFlags(Flags);
2471 
2472   // Give source modifier matching some assistance before obscuring a foldable
2473   // pattern.
2474 
2475   // TODO: We can avoid the neg on the fract? The input sign to fract
2476   // shouldn't matter?
2477   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2478 
2479   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2480 
2481   Register Min = MRI.createGenericVirtualRegister(S64);
2482 
2483   // We don't need to concern ourselves with the snan handling difference, so
2484   // use the one which will directly select.
2485   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2486   if (MFI->getMode().IEEE)
2487     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2488   else
2489     B.buildFMinNum(Min, Fract, Const, Flags);
2490 
2491   Register CorrectedFract = Min;
2492   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2493     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2494     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2495   }
2496 
2497   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2498   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2499 
2500   MI.eraseFromParent();
2501   return true;
2502 }
2503 
2504 // Turn an illegal packed v2s16 build vector into bit operations.
2505 // TODO: This should probably be a bitcast action in LegalizerHelper.
2506 bool AMDGPULegalizerInfo::legalizeBuildVector(
2507   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2508   Register Dst = MI.getOperand(0).getReg();
2509   const LLT S32 = LLT::scalar(32);
2510   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2511 
2512   Register Src0 = MI.getOperand(1).getReg();
2513   Register Src1 = MI.getOperand(2).getReg();
2514   assert(MRI.getType(Src0) == LLT::scalar(16));
2515 
2516   auto Merge = B.buildMerge(S32, {Src0, Src1});
2517   B.buildBitcast(Dst, Merge);
2518 
2519   MI.eraseFromParent();
2520   return true;
2521 }
2522 
// Return the use branch instruction, or null if the usage is invalid.
2524 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2525                                        MachineRegisterInfo &MRI,
2526                                        MachineInstr *&Br,
2527                                        MachineBasicBlock *&UncondBrTarget) {
2528   Register CondDef = MI.getOperand(0).getReg();
2529   if (!MRI.hasOneNonDBGUse(CondDef))
2530     return nullptr;
2531 
2532   MachineBasicBlock *Parent = MI.getParent();
2533   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2534   if (UseMI.getParent() != Parent ||
2535       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2536     return nullptr;
2537 
2538   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2539   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2540   if (Next == Parent->end()) {
2541     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2542     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2543       return nullptr;
2544     UncondBrTarget = &*NextMBB;
2545   } else {
2546     if (Next->getOpcode() != AMDGPU::G_BR)
2547       return nullptr;
2548     Br = &*Next;
2549     UncondBrTarget = Br->getOperand(0).getMBB();
2550   }
2551 
2552   return &UseMI;
2553 }
2554 
2555 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2556                                          const ArgDescriptor *Arg,
2557                                          const TargetRegisterClass *ArgRC,
2558                                          LLT ArgTy) const {
2559   MCRegister SrcReg = Arg->getRegister();
2560   assert(SrcReg.isPhysical() && "Physical register expected");
2561   assert(DstReg.isVirtual() && "Virtual register expected");
2562 
2563   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2564                                              ArgTy);
2565   if (Arg->isMasked()) {
2566     // TODO: Should we try to emit this once in the entry block?
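    // A masked argument (such as the packed work item IDs) occupies a
    // bitfield of the incoming physical register: shift the field down to
    // bit 0 and mask off the neighbouring fields.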
2567     const LLT S32 = LLT::scalar(32);
2568     const unsigned Mask = Arg->getMask();
2569     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2570 
2571     Register AndMaskSrc = LiveIn;
2572 
2573     if (Shift != 0) {
2574       auto ShiftAmt = B.buildConstant(S32, Shift);
2575       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2576     }
2577 
2578     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2579   } else {
2580     B.buildCopy(DstReg, LiveIn);
2581   }
2582 
2583   return true;
2584 }
2585 
2586 bool AMDGPULegalizerInfo::loadInputValue(
2587     Register DstReg, MachineIRBuilder &B,
2588     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2589   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2590   const ArgDescriptor *Arg;
2591   const TargetRegisterClass *ArgRC;
2592   LLT ArgTy;
2593   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2594 
2595   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2596     return false; // TODO: Handle these
2597   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2598 }
2599 
2600 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2601     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2602     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2603   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2604     return false;
2605 
2606   MI.eraseFromParent();
2607   return true;
2608 }
2609 
2610 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2611                                        MachineRegisterInfo &MRI,
2612                                        MachineIRBuilder &B) const {
2613   Register Dst = MI.getOperand(0).getReg();
2614   LLT DstTy = MRI.getType(Dst);
2615   LLT S16 = LLT::scalar(16);
2616   LLT S32 = LLT::scalar(32);
2617   LLT S64 = LLT::scalar(64);
2618 
2619   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2620     return true;
2621 
2622   if (DstTy == S16)
2623     return legalizeFDIV16(MI, MRI, B);
2624   if (DstTy == S32)
2625     return legalizeFDIV32(MI, MRI, B);
2626   if (DstTy == S64)
2627     return legalizeFDIV64(MI, MRI, B);
2628 
2629   return false;
2630 }
2631 
2632 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2633                                                   Register DstReg,
2634                                                   Register X,
2635                                                   Register Y,
2636                                                   bool IsDiv) const {
2637   const LLT S1 = LLT::scalar(1);
2638   const LLT S32 = LLT::scalar(32);
2639 
2640   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2641   // algorithm used here.
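  // In short: estimate z ~= 2^32 / y from a 32-bit float reciprocal, refine
  // it with one Newton-Raphson step, take q = umulh(x, z), and then apply up
  // to two conditional corrections to the quotient/remainder.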
2642 
2643   // Initial estimate of inv(y).
2644   auto FloatY = B.buildUITOFP(S32, Y);
2645   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2646   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2647   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2648   auto Z = B.buildFPTOUI(S32, ScaledY);
2649 
  // One round of UNR (unsigned Newton-Raphson refinement).
2651   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2652   auto NegYZ = B.buildMul(S32, NegY, Z);
2653   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2654 
2655   // Quotient/remainder estimate.
2656   auto Q = B.buildUMulH(S32, X, Z);
2657   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2658 
2659   // First quotient/remainder refinement.
2660   auto One = B.buildConstant(S32, 1);
2661   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2662   if (IsDiv)
2663     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2664   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2665 
2666   // Second quotient/remainder refinement.
2667   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2668   if (IsDiv)
2669     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2670   else
2671     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2672 }
2673 
2674 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2675                                               MachineRegisterInfo &MRI,
2676                                               MachineIRBuilder &B) const {
2677   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2678   Register DstReg = MI.getOperand(0).getReg();
2679   Register Num = MI.getOperand(1).getReg();
2680   Register Den = MI.getOperand(2).getReg();
2681   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2682   MI.eraseFromParent();
2683   return true;
2684 }
2685 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2687 //
2688 // Return lo, hi of result
2689 //
2690 // %cvt.lo = G_UITOFP Val.lo
2691 // %cvt.hi = G_UITOFP Val.hi
2692 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2693 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2694 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2695 // %mul2 = G_FMUL %mul1, 2**(-32)
2696 // %trunc = G_INTRINSIC_TRUNC %mul2
2697 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2698 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2699 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2700                                                        Register Val) {
2701   const LLT S32 = LLT::scalar(32);
2702   auto Unmerge = B.buildUnmerge(S32, Val);
2703 
2704   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2705   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2706 
2707   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2708                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2709 
2710   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2711   auto Mul1 =
2712       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2713 
2714   // 2**(-32)
2715   auto Mul2 =
2716       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2717   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2718 
2719   // -(2**32)
2720   auto Mad2 = B.buildFMAD(S32, Trunc,
2721                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2722 
2723   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2724   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2725 
2726   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2727 }
2728 
2729 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2730                                                   Register DstReg,
2731                                                   Register Numer,
2732                                                   Register Denom,
2733                                                   bool IsDiv) const {
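  // Roughly: start from the float-based reciprocal estimate of the
  // denominator (emitReciprocalU64), refine it with two multiply-high
  // correction steps, form a candidate quotient with umulh, and then apply up
  // to two conditional +1 / -denominator fixups to the quotient and
  // remainder.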
2734   const LLT S32 = LLT::scalar(32);
2735   const LLT S64 = LLT::scalar(64);
2736   const LLT S1 = LLT::scalar(1);
2737   Register RcpLo, RcpHi;
2738 
2739   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2740 
2741   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2742 
2743   auto Zero64 = B.buildConstant(S64, 0);
2744   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2745 
2746   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2747   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2748 
2749   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2750   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2751   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2752 
2753   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2754   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2755   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2756   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2757 
2758   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2759   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2760   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2761   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2762   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2763 
2764   auto Zero32 = B.buildConstant(S32, 0);
2765   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2766   auto Add2_HiC =
2767       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2768   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2769   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2770 
2771   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2772   Register NumerLo = UnmergeNumer.getReg(0);
2773   Register NumerHi = UnmergeNumer.getReg(1);
2774 
2775   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2776   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2777   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2778   Register Mul3_Lo = UnmergeMul3.getReg(0);
2779   Register Mul3_Hi = UnmergeMul3.getReg(1);
2780   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2781   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2782   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2783   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2784 
2785   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2786   Register DenomLo = UnmergeDenom.getReg(0);
2787   Register DenomHi = UnmergeDenom.getReg(1);
2788 
2789   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2790   auto C1 = B.buildSExt(S32, CmpHi);
2791 
2792   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2793   auto C2 = B.buildSExt(S32, CmpLo);
2794 
2795   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2796   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2797 
2798   // TODO: Here and below portions of the code can be enclosed into if/endif.
2799   // Currently control flow is unconditional and we have 4 selects after
2800   // potential endif to substitute PHIs.
2801 
2802   // if C3 != 0 ...
2803   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2804   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2805   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2806   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2807 
2808   auto One64 = B.buildConstant(S64, 1);
2809   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2810 
2811   auto C4 =
2812       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2813   auto C5 =
2814       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2815   auto C6 = B.buildSelect(
2816       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2817 
2818   // if (C6 != 0)
2819   auto Add4 = B.buildAdd(S64, Add3, One64);
2820   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2821 
2822   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2823   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2824   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2825 
2826   // endif C6
2827   // endif C3
2828 
2829   if (IsDiv) {
2830     auto Sel1 = B.buildSelect(
2831         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2832     B.buildSelect(DstReg,
2833                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2834   } else {
2835     auto Sel2 = B.buildSelect(
2836         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2837     B.buildSelect(DstReg,
2838                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2839   }
2840 }
2841 
2842 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2843                                             MachineRegisterInfo &MRI,
2844                                             MachineIRBuilder &B) const {
2845   const LLT S64 = LLT::scalar(64);
2846   const LLT S32 = LLT::scalar(32);
2847   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2848   Register DstReg = MI.getOperand(0).getReg();
2849   Register Num = MI.getOperand(1).getReg();
2850   Register Den = MI.getOperand(2).getReg();
2851   LLT Ty = MRI.getType(DstReg);
2852 
2853   if (Ty == S32)
2854     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2855   else if (Ty == S64)
2856     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2857   else
2858     return false;
2859 
2860   MI.eraseFromParent();
2861   return true;
2862 
2863 }
2864 
2865 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2866                                             MachineRegisterInfo &MRI,
2867                                             MachineIRBuilder &B) const {
2868   const LLT S64 = LLT::scalar(64);
2869   const LLT S32 = LLT::scalar(32);
2870 
2871   Register DstReg = MI.getOperand(0).getReg();
2872   const LLT Ty = MRI.getType(DstReg);
2873   if (Ty != S32 && Ty != S64)
2874     return false;
2875 
2876   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2877 
2878   Register LHS = MI.getOperand(1).getReg();
2879   Register RHS = MI.getOperand(2).getReg();
2880 
2881   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2882   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2883   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2884 
2885   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2886   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2887 
2888   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2889   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
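  // The adds and xors above conditionally negate the operands: with
  // Sign = X >> (BitWidth - 1), (X + Sign) ^ Sign yields the absolute value.
  // Below, the unsigned result is mapped back the same way using the expected
  // result sign (LHSign ^ RHSign for the quotient, LHSign for the remainder).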
2890 
2891   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2892   if (Ty == S32)
2893     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2894   else
2895     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2896 
2897   Register Sign;
2898   if (IsDiv)
2899     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2900   else
2901     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2902 
2903   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2904   B.buildSub(DstReg, UDivRem, Sign);
2905 
2906   MI.eraseFromParent();
2907   return true;
2908 }
2909 
2910 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2911                                                  MachineRegisterInfo &MRI,
2912                                                  MachineIRBuilder &B) const {
2913   Register Res = MI.getOperand(0).getReg();
2914   Register LHS = MI.getOperand(1).getReg();
2915   Register RHS = MI.getOperand(2).getReg();
2916 
2917   uint16_t Flags = MI.getFlags();
2918 
2919   LLT ResTy = MRI.getType(Res);
2920   LLT S32 = LLT::scalar(32);
2921   LLT S64 = LLT::scalar(64);
2922 
2923   const MachineFunction &MF = B.getMF();
2924   bool Unsafe =
2925     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2926 
2927   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2928     return false;
2929 
2930   if (!Unsafe && ResTy == S32 &&
2931       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2932     return false;
2933 
2934   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2935     // 1 / x -> RCP(x)
2936     if (CLHS->isExactlyValue(1.0)) {
2937       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2938         .addUse(RHS)
2939         .setMIFlags(Flags);
2940 
2941       MI.eraseFromParent();
2942       return true;
2943     }
2944 
2945     // -1 / x -> RCP( FNEG(x) )
2946     if (CLHS->isExactlyValue(-1.0)) {
2947       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2948       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2949         .addUse(FNeg.getReg(0))
2950         .setMIFlags(Flags);
2951 
2952       MI.eraseFromParent();
2953       return true;
2954     }
2955   }
2956 
2957   // x / y -> x * (1.0 / y)
2958   if (Unsafe) {
2959     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2960       .addUse(RHS)
2961       .setMIFlags(Flags);
2962     B.buildFMul(Res, LHS, RCP, Flags);
2963 
2964     MI.eraseFromParent();
2965     return true;
2966   }
2967 
2968   return false;
2969 }
2970 
2971 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2972                                          MachineRegisterInfo &MRI,
2973                                          MachineIRBuilder &B) const {
2974   Register Res = MI.getOperand(0).getReg();
2975   Register LHS = MI.getOperand(1).getReg();
2976   Register RHS = MI.getOperand(2).getReg();
2977 
2978   uint16_t Flags = MI.getFlags();
2979 
2980   LLT S16 = LLT::scalar(16);
2981   LLT S32 = LLT::scalar(32);
2982 
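  // A sketch of the approach: promote both f16 operands to f32, form
  // LHS * rcp(RHS) in f32, truncate back to f16, and let amdgcn_div_fixup
  // handle the special cases (infinities, NaNs, etc.).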
2983   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2984   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2985 
2986   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2987     .addUse(RHSExt.getReg(0))
2988     .setMIFlags(Flags);
2989 
2990   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2991   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2992 
2993   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2994     .addUse(RDst.getReg(0))
2995     .addUse(RHS)
2996     .addUse(LHS)
2997     .setMIFlags(Flags);
2998 
2999   MI.eraseFromParent();
3000   return true;
3001 }
3002 
3003 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3004 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3005 static void toggleSPDenormMode(bool Enable,
3006                                MachineIRBuilder &B,
3007                                const GCNSubtarget &ST,
3008                                AMDGPU::SIModeRegisterDefaults Mode) {
3009   // Set SP denorm mode to this value.
3010   unsigned SPDenormMode =
3011     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3012 
3013   if (ST.hasDenormModeInst()) {
3014     // Preserve default FP64/FP16 denorm mode while updating FP32 mode.
3015     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3016 
3017     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3018     B.buildInstr(AMDGPU::S_DENORM_MODE)
3019       .addImm(NewDenormModeValue);
3020 
3021   } else {
3022     // Select FP32 bit field in mode register.
3023     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3024                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3025                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
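    // This hwreg descriptor selects the 2-bit FP32 denorm field of the MODE
    // register: register id MODE, bit offset 4, width 2 (WIDTH_M1 == 1).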
3026 
3027     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3028       .addImm(SPDenormMode)
3029       .addImm(SPDenormModeBitField);
3030   }
3031 }
3032 
3033 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3034                                          MachineRegisterInfo &MRI,
3035                                          MachineIRBuilder &B) const {
3036   Register Res = MI.getOperand(0).getReg();
3037   Register LHS = MI.getOperand(1).getReg();
3038   Register RHS = MI.getOperand(2).getReg();
3039   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3040   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3041 
3042   uint16_t Flags = MI.getFlags();
3043 
3044   LLT S32 = LLT::scalar(32);
3045   LLT S1 = LLT::scalar(1);
3046 
3047   auto One = B.buildFConstant(S32, 1.0f);
3048 
3049   auto DenominatorScaled =
3050     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3051       .addUse(LHS)
3052       .addUse(RHS)
3053       .addImm(0)
3054       .setMIFlags(Flags);
3055   auto NumeratorScaled =
3056     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3057       .addUse(LHS)
3058       .addUse(RHS)
3059       .addImm(1)
3060       .setMIFlags(Flags);
3061 
3062   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3063     .addUse(DenominatorScaled.getReg(0))
3064     .setMIFlags(Flags);
3065   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3066 
3067   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3068   // aren't modeled as reading it.
3069   if (!Mode.allFP32Denormals())
3070     toggleSPDenormMode(true, B, ST, Mode);
3071 
3072   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3073   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3074   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3075   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3076   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3077   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
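  // The FMA chain above is a Newton-Raphson style refinement: Fma0/Fma1
  // improve the approximate reciprocal, Mul forms an initial quotient, and
  // Fma2..Fma4 compute residuals and fold them back in so that
  // div_fmas/div_fixup can produce the final result.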
3078 
3079   if (!Mode.allFP32Denormals())
3080     toggleSPDenormMode(false, B, ST, Mode);
3081 
3082   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3083     .addUse(Fma4.getReg(0))
3084     .addUse(Fma1.getReg(0))
3085     .addUse(Fma3.getReg(0))
3086     .addUse(NumeratorScaled.getReg(1))
3087     .setMIFlags(Flags);
3088 
3089   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3090     .addUse(Fmas.getReg(0))
3091     .addUse(RHS)
3092     .addUse(LHS)
3093     .setMIFlags(Flags);
3094 
3095   MI.eraseFromParent();
3096   return true;
3097 }
3098 
3099 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3100                                          MachineRegisterInfo &MRI,
3101                                          MachineIRBuilder &B) const {
3102   Register Res = MI.getOperand(0).getReg();
3103   Register LHS = MI.getOperand(1).getReg();
3104   Register RHS = MI.getOperand(2).getReg();
3105 
3106   uint16_t Flags = MI.getFlags();
3107 
3108   LLT S64 = LLT::scalar(64);
3109   LLT S1 = LLT::scalar(1);
3110 
3111   auto One = B.buildFConstant(S64, 1.0);
3112 
3113   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3114     .addUse(LHS)
3115     .addUse(RHS)
3116     .addImm(0)
3117     .setMIFlags(Flags);
3118 
3119   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3120 
3121   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3122     .addUse(DivScale0.getReg(0))
3123     .setMIFlags(Flags);
3124 
3125   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3126   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3127   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3128 
3129   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3130     .addUse(LHS)
3131     .addUse(RHS)
3132     .addImm(1)
3133     .setMIFlags(Flags);
3134 
3135   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3136   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3137   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3138 
3139   Register Scale;
3140   if (!ST.hasUsableDivScaleConditionOutput()) {
3141     // Work around a hardware bug on SI where the condition output from div_scale
3142     // is not usable.
3143 
3144     LLT S32 = LLT::scalar(32);
3145 
3146     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3147     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3148     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3149     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3150 
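    // Roughly: comparing the high dwords of the original operands against the
    // div_scale results reveals which operand was actually scaled, and from
    // that we reconstruct the condition bit div_fmas uses to apply the final
    // scaling step.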
3151     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3152                               Scale1Unmerge.getReg(1));
3153     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3154                               Scale0Unmerge.getReg(1));
3155     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3156   } else {
3157     Scale = DivScale1.getReg(1);
3158   }
3159 
3160   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3161     .addUse(Fma4.getReg(0))
3162     .addUse(Fma3.getReg(0))
3163     .addUse(Mul.getReg(0))
3164     .addUse(Scale)
3165     .setMIFlags(Flags);
3166 
3167   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3168     .addUse(Fmas.getReg(0))
3169     .addUse(RHS)
3170     .addUse(LHS)
3171     .setMIFlags(Flags);
3172 
3173   MI.eraseFromParent();
3174   return true;
3175 }
3176 
3177 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3178                                                  MachineRegisterInfo &MRI,
3179                                                  MachineIRBuilder &B) const {
3180   Register Res = MI.getOperand(0).getReg();
3181   Register LHS = MI.getOperand(2).getReg();
3182   Register RHS = MI.getOperand(3).getReg();
3183   uint16_t Flags = MI.getFlags();
3184 
3185   LLT S32 = LLT::scalar(32);
3186   LLT S1 = LLT::scalar(1);
3187 
3188   auto Abs = B.buildFAbs(S32, RHS, Flags);
3189   const APFloat C0Val(1.0f);
3190 
3191   auto C0 = B.buildConstant(S32, 0x6f800000);
3192   auto C1 = B.buildConstant(S32, 0x2f800000);
3193   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
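  // 0x6f800000 is 2.0^96 and 0x2f800000 is 2.0^-32 as f32 bit patterns: if
  // |RHS| is very large, pre-scale it by 2^-32 so the reciprocal does not
  // flush to zero, then multiply the quotient by the same factor afterwards
  // to compensate.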
3194 
3195   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3196   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3197 
3198   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3199 
3200   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3201     .addUse(Mul0.getReg(0))
3202     .setMIFlags(Flags);
3203 
3204   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3205 
3206   B.buildFMul(Res, Sel, Mul1, Flags);
3207 
3208   MI.eraseFromParent();
3209   return true;
3210 }
3211 
3212 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3213 // FIXME: Why do we handle this one but not other removed instructions?
3214 //
3215 // Reciprocal square root.  The clamp prevents infinite results, clamping
3216 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3217 // +-max_float.
3218 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3219                                                     MachineRegisterInfo &MRI,
3220                                                     MachineIRBuilder &B) const {
3221   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3222     return true;
3223 
3224   Register Dst = MI.getOperand(0).getReg();
3225   Register Src = MI.getOperand(2).getReg();
3226   auto Flags = MI.getFlags();
3227 
3228   LLT Ty = MRI.getType(Dst);
3229 
3230   const fltSemantics *FltSemantics;
3231   if (Ty == LLT::scalar(32))
3232     FltSemantics = &APFloat::IEEEsingle();
3233   else if (Ty == LLT::scalar(64))
3234     FltSemantics = &APFloat::IEEEdouble();
3235   else
3236     return false;
3237 
3238   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3239     .addUse(Src)
3240     .setMIFlags(Flags);
3241 
3242   // We don't need to concern ourselves with the snan handling difference, since
3243   // the rsq result is already quieted (or not); use the variant which will directly select.
3244   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3245   const bool UseIEEE = MFI->getMode().IEEE;
3246 
3247   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3248   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3249                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3250 
3251   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3252 
3253   if (UseIEEE)
3254     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3255   else
3256     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3257   MI.eraseFromParent();
3258   return true;
3259 }
3260 
3261 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3262   switch (IID) {
3263   case Intrinsic::amdgcn_ds_fadd:
3264     return AMDGPU::G_ATOMICRMW_FADD;
3265   case Intrinsic::amdgcn_ds_fmin:
3266     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3267   case Intrinsic::amdgcn_ds_fmax:
3268     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3269   default:
3270     llvm_unreachable("not a DS FP intrinsic");
3271   }
3272 }
3273 
3274 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3275                                                       MachineInstr &MI,
3276                                                       Intrinsic::ID IID) const {
3277   GISelChangeObserver &Observer = Helper.Observer;
3278   Observer.changingInstr(MI);
3279 
3280   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3281 
3282   // The remaining operands were used to set fields in the MemOperand on
3283   // construction.
3284   for (int I = 6; I > 3; --I)
3285     MI.RemoveOperand(I);
3286 
3287   MI.RemoveOperand(1); // Remove the intrinsic ID.
3288   Observer.changedInstr(MI);
3289   return true;
3290 }
3291 
3292 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3293                                             MachineRegisterInfo &MRI,
3294                                             MachineIRBuilder &B) const {
3295   uint64_t Offset =
3296     ST.getTargetLowering()->getImplicitParameterOffset(
3297       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3298   LLT DstTy = MRI.getType(DstReg);
3299   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3300 
3301   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3302   if (!loadInputValue(KernargPtrReg, B,
3303                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3304     return false;
3305 
3306   // FIXME: This should be nuw
3307   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3308   return true;
3309 }
3310 
3311 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3312                                                  MachineRegisterInfo &MRI,
3313                                                  MachineIRBuilder &B) const {
3314   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3315   if (!MFI->isEntryFunction()) {
3316     return legalizePreloadedArgIntrin(MI, MRI, B,
3317                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3318   }
3319 
3320   Register DstReg = MI.getOperand(0).getReg();
3321   if (!getImplicitArgPtr(DstReg, MRI, B))
3322     return false;
3323 
3324   MI.eraseFromParent();
3325   return true;
3326 }
3327 
3328 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3329                                               MachineRegisterInfo &MRI,
3330                                               MachineIRBuilder &B,
3331                                               unsigned AddrSpace) const {
3332   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3333   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3334   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3335   MI.eraseFromParent();
3336   return true;
3337 }
3338 
3339 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3340 // offset (the offset that is included in bounds checking and swizzling, to be
3341 // split between the instruction's voffset and immoffset fields) and soffset
3342 // (the offset that is excluded from bounds checking and swizzling, to go in
3343 // the instruction's soffset field).  This function takes the first kind of
3344 // offset and figures out how to split it between voffset and immoffset.
3345 std::tuple<Register, unsigned, unsigned>
3346 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3347                                         Register OrigOffset) const {
3348   const unsigned MaxImm = 4095;
3349   Register BaseReg;
3350   unsigned TotalConstOffset;
3351   MachineInstr *OffsetDef;
3352   const LLT S32 = LLT::scalar(32);
3353 
3354   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3355     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3356 
3357   unsigned ImmOffset = TotalConstOffset;
3358 
3359   // If the immediate value is too big for the immoffset field, keep only its
3360   // low 12 bits in the immoffset field so that the value that is copied/added
3361   // for the voffset field is a multiple of 4096, and it stands more chance
3362   // of being CSEd with the copy/add for another similar load/store.
3363   // However, do not do that rounding down to a multiple of 4096 if that is a
3364   // negative number, as it appears to be illegal to have a negative offset
3365   // in the vgpr, even if adding the immediate offset makes it positive.
3366   unsigned Overflow = ImmOffset & ~MaxImm;
3367   ImmOffset -= Overflow;
3368   if ((int32_t)Overflow < 0) {
3369     Overflow += ImmOffset;
3370     ImmOffset = 0;
3371   }
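  // For example, a constant offset of 8200 is split into ImmOffset = 8 with
  // 8192 added to the voffset register, while an offset with bit 31 set goes
  // entirely into the voffset add and leaves ImmOffset at 0.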
3372 
3373   if (Overflow != 0) {
3374     if (!BaseReg) {
3375       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3376     } else {
3377       auto OverflowVal = B.buildConstant(S32, Overflow);
3378       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3379     }
3380   }
3381 
3382   if (!BaseReg)
3383     BaseReg = B.buildConstant(S32, 0).getReg(0);
3384 
3385   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3386 }
3387 
3388 /// Handle register layout difference for f16 images for some subtargets.
3389 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3390                                              MachineRegisterInfo &MRI,
3391                                              Register Reg) const {
3392   if (!ST.hasUnpackedD16VMem())
3393     return Reg;
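  // On subtargets with unpacked D16 memory instructions each 16-bit component
  // occupies the low half of its own 32-bit register, so e.g. a <4 x s16>
  // value becomes a <4 x s32> of any-extended components.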
3394 
3395   const LLT S16 = LLT::scalar(16);
3396   const LLT S32 = LLT::scalar(32);
3397   LLT StoreVT = MRI.getType(Reg);
3398   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3399 
3400   auto Unmerge = B.buildUnmerge(S16, Reg);
3401 
3402   SmallVector<Register, 4> WideRegs;
3403   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3404     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3405 
3406   int NumElts = StoreVT.getNumElements();
3407 
3408   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3409 }
3410 
3411 Register AMDGPULegalizerInfo::fixStoreSourceType(
3412   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3413   MachineRegisterInfo *MRI = B.getMRI();
3414   LLT Ty = MRI->getType(VData);
3415 
3416   const LLT S16 = LLT::scalar(16);
3417 
3418   // Fixup illegal register types for i8 stores.
3419   if (Ty == LLT::scalar(8) || Ty == S16) {
3420     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3421     return AnyExt;
3422   }
3423 
3424   if (Ty.isVector()) {
3425     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3426       if (IsFormat)
3427         return handleD16VData(B, *MRI, VData);
3428     }
3429   }
3430 
3431   return VData;
3432 }
3433 
3434 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3435                                               MachineRegisterInfo &MRI,
3436                                               MachineIRBuilder &B,
3437                                               bool IsTyped,
3438                                               bool IsFormat) const {
3439   Register VData = MI.getOperand(1).getReg();
3440   LLT Ty = MRI.getType(VData);
3441   LLT EltTy = Ty.getScalarType();
3442   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3443   const LLT S32 = LLT::scalar(32);
3444 
3445   VData = fixStoreSourceType(B, VData, IsFormat);
3446   Register RSrc = MI.getOperand(2).getReg();
3447 
3448   MachineMemOperand *MMO = *MI.memoperands_begin();
3449   const int MemSize = MMO->getSize();
3450 
3451   unsigned ImmOffset;
3452   unsigned TotalOffset;
3453 
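  // Operand layout after the intrinsic ID: vdata, rsrc, [vindex for the
  // struct variants,] voffset, soffset, [format for the typed variants,]
  // auxiliary/cachepolicy bits.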
3454   // The typed intrinsics add an immediate after the registers.
3455   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3456 
3457   // The struct intrinsic variants add one additional operand over raw.
3458   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3459   Register VIndex;
3460   int OpOffset = 0;
3461   if (HasVIndex) {
3462     VIndex = MI.getOperand(3).getReg();
3463     OpOffset = 1;
3464   }
3465 
3466   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3467   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3468 
3469   unsigned Format = 0;
3470   if (IsTyped) {
3471     Format = MI.getOperand(5 + OpOffset).getImm();
3472     ++OpOffset;
3473   }
3474 
3475   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3476 
3477   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3478   if (TotalOffset != 0)
3479     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3480 
3481   unsigned Opc;
3482   if (IsTyped) {
3483     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3484                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3485   } else if (IsFormat) {
3486     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3487                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3488   } else {
3489     switch (MemSize) {
3490     case 1:
3491       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3492       break;
3493     case 2:
3494       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3495       break;
3496     default:
3497       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3498       break;
3499     }
3500   }
3501 
3502   if (!VIndex)
3503     VIndex = B.buildConstant(S32, 0).getReg(0);
3504 
3505   auto MIB = B.buildInstr(Opc)
3506     .addUse(VData)              // vdata
3507     .addUse(RSrc)               // rsrc
3508     .addUse(VIndex)             // vindex
3509     .addUse(VOffset)            // voffset
3510     .addUse(SOffset)            // soffset
3511     .addImm(ImmOffset);         // offset(imm)
3512 
3513   if (IsTyped)
3514     MIB.addImm(Format);
3515 
3516   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3517      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3518      .addMemOperand(MMO);
3519 
3520   MI.eraseFromParent();
3521   return true;
3522 }
3523 
3524 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3525                                              MachineRegisterInfo &MRI,
3526                                              MachineIRBuilder &B,
3527                                              bool IsFormat,
3528                                              bool IsTyped) const {
3529   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3530   MachineMemOperand *MMO = *MI.memoperands_begin();
3531   const int MemSize = MMO->getSize();
3532   const LLT S32 = LLT::scalar(32);
3533 
3534   Register Dst = MI.getOperand(0).getReg();
3535   Register RSrc = MI.getOperand(2).getReg();
3536 
3537   // The typed intrinsics add an immediate after the registers.
3538   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3539 
3540   // The struct intrinsic variants add one additional operand over raw.
3541   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3542   Register VIndex;
3543   int OpOffset = 0;
3544   if (HasVIndex) {
3545     VIndex = MI.getOperand(3).getReg();
3546     OpOffset = 1;
3547   }
3548 
3549   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3550   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3551 
3552   unsigned Format = 0;
3553   if (IsTyped) {
3554     Format = MI.getOperand(5 + OpOffset).getImm();
3555     ++OpOffset;
3556   }
3557 
3558   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3559   unsigned ImmOffset;
3560   unsigned TotalOffset;
3561 
3562   LLT Ty = MRI.getType(Dst);
3563   LLT EltTy = Ty.getScalarType();
3564   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3565   const bool Unpacked = ST.hasUnpackedD16VMem();
3566 
3567   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3568   if (TotalOffset != 0)
3569     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3570 
3571   unsigned Opc;
3572 
3573   if (IsTyped) {
3574     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3575                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3576   } else if (IsFormat) {
3577     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3578                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3579   } else {
3580     switch (MemSize) {
3581     case 1:
3582       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3583       break;
3584     case 2:
3585       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3586       break;
3587     default:
3588       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3589       break;
3590     }
3591   }
3592 
3593   Register LoadDstReg;
3594 
3595   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3596   LLT UnpackedTy = Ty.changeElementSize(32);
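  // Sub-dword loads (and scalar d16 loads) produce a 32-bit result register
  // that is truncated afterwards; packed d16 vectors on unpacked subtargets
  // are widened to one 32-bit register per element and repacked below.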
3597 
3598   if (IsExtLoad)
3599     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3600   else if (Unpacked && IsD16 && Ty.isVector())
3601     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3602   else
3603     LoadDstReg = Dst;
3604 
3605   if (!VIndex)
3606     VIndex = B.buildConstant(S32, 0).getReg(0);
3607 
3608   auto MIB = B.buildInstr(Opc)
3609     .addDef(LoadDstReg)         // vdata
3610     .addUse(RSrc)               // rsrc
3611     .addUse(VIndex)             // vindex
3612     .addUse(VOffset)            // voffset
3613     .addUse(SOffset)            // soffset
3614     .addImm(ImmOffset);         // offset(imm)
3615 
3616   if (IsTyped)
3617     MIB.addImm(Format);
3618 
3619   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3620      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3621      .addMemOperand(MMO);
3622 
3623   if (LoadDstReg != Dst) {
3624     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3625 
3626     // The result was widened for the extending load; truncate it back.
3627     if (IsExtLoad)
3628       B.buildTrunc(Dst, LoadDstReg);
3629     else {
3630       // Repack to original 16-bit vector result
3631       // FIXME: G_TRUNC should work, but legalization currently fails
3632       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3633       SmallVector<Register, 4> Repack;
3634       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3635         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3636       B.buildMerge(Dst, Repack);
3637     }
3638   }
3639 
3640   MI.eraseFromParent();
3641   return true;
3642 }
3643 
3644 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3645                                                MachineIRBuilder &B,
3646                                                bool IsInc) const {
3647   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3648                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3649   B.buildInstr(Opc)
3650     .addDef(MI.getOperand(0).getReg())
3651     .addUse(MI.getOperand(2).getReg())
3652     .addUse(MI.getOperand(3).getReg())
3653     .cloneMemRefs(MI);
3654   MI.eraseFromParent();
3655   return true;
3656 }
3657 
3658 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3659   switch (IntrID) {
3660   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3661   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3662     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3663   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3664   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3665     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3666   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3667   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3668     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3669   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3670   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3671     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3672   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3673   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3674     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3675   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3676   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3677     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3678   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3679   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3680     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3681   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3682   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3683     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3684   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3685   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3686     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3687   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3688   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3689     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3690   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3691   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3692     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3693   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3694   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3695     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3696   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3697   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3698     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3699   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3700   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3701     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3702   default:
3703     llvm_unreachable("unhandled atomic opcode");
3704   }
3705 }
3706 
3707 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3708                                                MachineIRBuilder &B,
3709                                                Intrinsic::ID IID) const {
3710   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3711                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3712   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3713 
3714   Register Dst;
3715 
3716   int OpOffset = 0;
3717   if (HasReturn) {
3718     // A few FP atomics do not support return values.
3719     Dst = MI.getOperand(0).getReg();
3720   } else {
3721     OpOffset = -1;
3722   }
3723 
3724   Register VData = MI.getOperand(2 + OpOffset).getReg();
3725   Register CmpVal;
3726 
3727   if (IsCmpSwap) {
3728     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3729     ++OpOffset;
3730   }
3731 
3732   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3733   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3734 
3735   // The struct intrinsic variants add one additional operand over raw.
3736   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3737   Register VIndex;
3738   if (HasVIndex) {
3739     VIndex = MI.getOperand(4 + OpOffset).getReg();
3740     ++OpOffset;
3741   }
3742 
3743   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3744   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3745   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3746 
3747   MachineMemOperand *MMO = *MI.memoperands_begin();
3748 
3749   unsigned ImmOffset;
3750   unsigned TotalOffset;
3751   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3752   if (TotalOffset != 0)
3753     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3754 
3755   if (!VIndex)
3756     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3757 
3758   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3759 
3760   if (HasReturn)
3761     MIB.addDef(Dst);
3762 
3763   MIB.addUse(VData); // vdata
3764 
3765   if (IsCmpSwap)
3766     MIB.addReg(CmpVal);
3767 
3768   MIB.addUse(RSrc)               // rsrc
3769      .addUse(VIndex)             // vindex
3770      .addUse(VOffset)            // voffset
3771      .addUse(SOffset)            // soffset
3772      .addImm(ImmOffset)          // offset(imm)
3773      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3774      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3775      .addMemOperand(MMO);
3776 
3777   MI.eraseFromParent();
3778   return true;
3779 }
3780 
3781 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3782 /// vector with s16 typed elements.
3783 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3784                                         SmallVectorImpl<Register> &PackedAddrs,
3785                                         int AddrIdx, int DimIdx, int EndIdx,
3786                                         int NumGradients) {
3787   const LLT S16 = LLT::scalar(16);
3788   const LLT V2S16 = LLT::vector(2, 16);
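  // For example, the (u, v) coordinates of a 2D sample end up in a single
  // <2 x s16> register, while a trailing odd coordinate (e.g. the third
  // coordinate of a 3D sample) gets paired with an undef half.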
3789 
3790   for (int I = AddrIdx; I < EndIdx; ++I) {
3791     MachineOperand &SrcOp = MI.getOperand(I);
3792     if (!SrcOp.isReg())
3793       continue; // _L to _LZ may have eliminated this.
3794 
3795     Register AddrReg = SrcOp.getReg();
3796 
3797     if (I < DimIdx) {
3798       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3799       PackedAddrs.push_back(AddrReg);
3800     } else {
3801       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3802       // derivatives dx/dh and dx/dv are packed with undef.
3803       if (((I + 1) >= EndIdx) ||
3804           ((NumGradients / 2) % 2 == 1 &&
3805            (I == DimIdx + (NumGradients / 2) - 1 ||
3806             I == DimIdx + NumGradients - 1)) ||
3807           // Check for _L to _LZ optimization
3808           !MI.getOperand(I + 1).isReg()) {
3809         PackedAddrs.push_back(
3810             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3811                 .getReg(0));
3812       } else {
3813         PackedAddrs.push_back(
3814             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3815                 .getReg(0));
3816         ++I;
3817       }
3818     }
3819   }
3820 }
3821 
3822 /// Convert from separate vaddr components to a single vector address register,
3823 /// and replace the remaining operands with $noreg.
3824 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3825                                      int DimIdx, int NumVAddrs) {
3826   const LLT S32 = LLT::scalar(32);
3827 
3828   SmallVector<Register, 8> AddrRegs;
3829   for (int I = 0; I != NumVAddrs; ++I) {
3830     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3831     if (SrcOp.isReg()) {
3832       AddrRegs.push_back(SrcOp.getReg());
3833       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3834     }
3835   }
3836 
3837   int NumAddrRegs = AddrRegs.size();
3838   if (NumAddrRegs != 1) {
3839     // Round up to 8 elements for v5-v7
3840     // FIXME: Missing intermediate sized register classes and instructions.
3841     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3842       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3843       auto Undef = B.buildUndef(S32);
3844       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3845       NumAddrRegs = RoundedNumRegs;
3846     }
3847 
3848     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3849     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3850   }
3851 
3852   for (int I = 1; I != NumVAddrs; ++I) {
3853     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3854     if (SrcOp.isReg())
3855       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3856   }
3857 }
3858 
3859 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3860 ///
3861 /// Depending on the subtarget, load/store with 16-bit element data need to be
3862 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3863 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3864 /// registers.
3865 ///
3866 /// We don't want to directly select image instructions just yet, but also want
3867 /// to expose all register repacking to the legalizer/combiners. We also don't
3868 /// want a selected instruction entering RegBankSelect. In order to avoid
3869 /// defining a multitude of intermediate image instructions, directly hack on
3870 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3871 /// now unnecessary arguments with $noreg.
3872 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3873     MachineInstr &MI, MachineIRBuilder &B,
3874     GISelChangeObserver &Observer,
3875     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3876 
3877   const int NumDefs = MI.getNumExplicitDefs();
3878   bool IsTFE = NumDefs == 2;
3879   // We are only processing the operands of d16 image operations on subtargets
3880   // that use the unpacked register layout, or need to repack the TFE result.
3881 
3882   // TODO: Do we need to guard against already legalized intrinsics?
3883   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3884     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3885 
3886   MachineRegisterInfo *MRI = B.getMRI();
3887   const LLT S32 = LLT::scalar(32);
3888   const LLT S16 = LLT::scalar(16);
3889   const LLT V2S16 = LLT::vector(2, 16);
3890 
3891   // Index of first address argument
3892   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3893 
3894   int NumVAddrs, NumGradients;
3895   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3896   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3897     getDMaskIdx(BaseOpcode, NumDefs);
3898   unsigned DMask = 0;
3899 
3900   // Check for 16-bit addresses and gradients; these are packed below if so.
3901   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3902   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3903   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3904   const bool IsG16 = GradTy == S16;
3905   const bool IsA16 = AddrTy == S16;
3906 
3907   int DMaskLanes = 0;
3908   if (!BaseOpcode->Atomic) {
3909     DMask = MI.getOperand(DMaskIdx).getImm();
3910     if (BaseOpcode->Gather4) {
3911       DMaskLanes = 4;
3912     } else if (DMask != 0) {
3913       DMaskLanes = countPopulation(DMask);
3914     } else if (!IsTFE && !BaseOpcode->Store) {
3915       // If dmask is 0, this is a no-op load. This can be eliminated.
3916       B.buildUndef(MI.getOperand(0));
3917       MI.eraseFromParent();
3918       return true;
3919     }
3920   }
3921 
3922   Observer.changingInstr(MI);
3923   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3924 
3925   unsigned NewOpcode = NumDefs == 0 ?
3926     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3927 
3928   // Track that we legalized this
3929   MI.setDesc(B.getTII().get(NewOpcode));
3930 
3931   // We expect to get an error flag since TFE is on and dmask is 0. Force
3932   // dmask to be at least 1, otherwise the instruction will fail.
3933   if (IsTFE && DMask == 0) {
3934     DMask = 0x1;
3935     DMaskLanes = 1;
3936     MI.getOperand(DMaskIdx).setImm(DMask);
3937   }
3938 
3939   if (BaseOpcode->Atomic) {
3940     Register VData0 = MI.getOperand(2).getReg();
3941     LLT Ty = MRI->getType(VData0);
3942 
3943     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3944     if (Ty.isVector())
3945       return false;
3946 
3947     if (BaseOpcode->AtomicX2) {
3948       Register VData1 = MI.getOperand(3).getReg();
3949       // The two values are packed in one register.
3950       LLT PackedTy = LLT::vector(2, Ty);
3951       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3952       MI.getOperand(2).setReg(Concat.getReg(0));
3953       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3954     }
3955   }
3956 
3957   int CorrectedNumVAddrs = NumVAddrs;
3958 
3959   // Optimize _L to _LZ when _L is zero
3960   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3961         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3962     const ConstantFP *ConstantLod;
3963     const int LodIdx = AddrIdx + NumVAddrs - 1;
3964 
3965     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3966       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3967         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3968         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3969           LZMappingInfo->LZ, ImageDimIntr->Dim);
3970 
3971         // The starting indexes should remain in the same place.
3972         --NumVAddrs;
3973         --CorrectedNumVAddrs;
3974 
3975         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3976           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3977         MI.RemoveOperand(LodIdx);
3978       }
3979     }
3980   }
3981 
3982   // Optimize _mip away, when 'lod' is zero
3983   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3984     int64_t ConstantLod;
3985     const int LodIdx = AddrIdx + NumVAddrs - 1;
3986 
3987     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3988       if (ConstantLod == 0) {
3989     // TODO: Change intrinsic opcode and remove operand instead of replacing
3990         // it with 0, as the _L to _LZ handling is done above.
3991         MI.getOperand(LodIdx).ChangeToImmediate(0);
3992         --CorrectedNumVAddrs;
3993       }
3994     }
3995   }
3996 
3997   // Rewrite the addressing register layout before doing anything else.
3998   if (IsA16 || IsG16) {
3999     if (IsA16) {
4000       // Target must support the feature and gradients need to be 16 bit too
4001       if (!ST.hasA16() || !IsG16)
4002         return false;
4003     } else if (!ST.hasG16())
4004       return false;
4005 
4006     if (NumVAddrs > 1) {
4007       SmallVector<Register, 4> PackedRegs;
4008       // Don't compress addresses for G16
4009       const int PackEndIdx =
4010           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4011       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4012                                   PackEndIdx, NumGradients);
4013 
4014       if (!IsA16) {
4015         // Add uncompressed address
4016         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4017           int AddrReg = MI.getOperand(I).getReg();
4018           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4019           PackedRegs.push_back(AddrReg);
4020         }
4021       }
4022 
4023       // See also below in the non-a16 branch
4024       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4025 
4026       if (!UseNSA && PackedRegs.size() > 1) {
4027         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4028         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4029         PackedRegs[0] = Concat.getReg(0);
4030         PackedRegs.resize(1);
4031       }
4032 
4033       const int NumPacked = PackedRegs.size();
4034       for (int I = 0; I != NumVAddrs; ++I) {
4035         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4036         if (!SrcOp.isReg()) {
4037           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4038           continue;
4039         }
4040 
4041         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4042 
4043         if (I < NumPacked)
4044           SrcOp.setReg(PackedRegs[I]);
4045         else
4046           SrcOp.setReg(AMDGPU::NoRegister);
4047       }
4048     }
4049   } else {
4050     // If the register allocator cannot place the address registers contiguously
4051     // without introducing moves, then using the non-sequential address encoding
4052     // is always preferable, since it saves VALU instructions and is usually a
4053     // wash in terms of code size or even better.
4054     //
4055     // However, we currently have no way of hinting to the register allocator
4056     // that MIMG addresses should be placed contiguously when it is possible to
4057     // do so, so force non-NSA for the common 2-address case as a heuristic.
4058     //
4059     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4060     // allocation when possible.
4061     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4062 
4063     if (!UseNSA && NumVAddrs > 1)
4064       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4065   }
4066 
4067   int Flags = 0;
4068   if (IsA16)
4069     Flags |= 1;
4070   if (IsG16)
4071     Flags |= 2;
4072   MI.addOperand(MachineOperand::CreateImm(Flags));
4073 
4074   if (BaseOpcode->Store) { // No TFE for stores?
4075     // TODO: Handle dmask trim
4076     Register VData = MI.getOperand(1).getReg();
4077     LLT Ty = MRI->getType(VData);
4078     if (!Ty.isVector() || Ty.getElementType() != S16)
4079       return true;
4080 
4081     Register RepackedReg = handleD16VData(B, *MRI, VData);
4082     if (RepackedReg != VData) {
4083       MI.getOperand(1).setReg(RepackedReg);
4084     }
4085 
4086     return true;
4087   }
4088 
4089   Register DstReg = MI.getOperand(0).getReg();
4090   LLT Ty = MRI->getType(DstReg);
4091   const LLT EltTy = Ty.getScalarType();
4092   const bool IsD16 = Ty.getScalarType() == S16;
4093   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4094 
4095   // Confirm that the return type is large enough for the dmask specified
4096   if (NumElts < DMaskLanes)
4097     return false;
4098 
4099   if (NumElts > 4 || DMaskLanes > 4)
4100     return false;
4101 
4102   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4103   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4104 
4105   // The raw dword aligned data component of the load. The only legal cases
4106   // where this matters should be when using the packed D16 format, for
4107   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4108   LLT RoundedTy;
4109 
4110   // S32 vector to cover all data, plus TFE result element.
4111   LLT TFETy;
4112 
4113   // Register type to use for each loaded component. Will be S32 or V2S16.
4114   LLT RegTy;
4115 
4116   if (IsD16 && ST.hasUnpackedD16VMem()) {
4117     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4118     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4119     RegTy = S32;
4120   } else {
4121     unsigned EltSize = EltTy.getSizeInBits();
4122     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4123     unsigned RoundedSize = 32 * RoundedElts;
4124     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4125     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4126     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4127   }
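  // For example, a packed d16 load of <3 x s16> with three dmask lanes uses
  // RoundedTy = <4 x s16>, TFETy = <3 x s32> and RegTy = <2 x s16> (or s32
  // when TFE is enabled).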
4128 
4129   // The return type does not need adjustment.
4130   // TODO: Should we change s16 case to s32 or <2 x s16>?
4131   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4132     return true;
4133 
4134   Register Dst1Reg;
4135 
4136   // Insert after the instruction.
4137   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4138 
4139   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4140   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4141   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4142   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4143 
4144   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4145 
4146   MI.getOperand(0).setReg(NewResultReg);
4147 
4148   // In the IR, TFE is supposed to be used with a 2 element struct return
4149   // type. The instruction really returns these two values in one contiguous
4150   // register, with one additional dword beyond the loaded data. Rewrite the
4151   // return type to use a single register result.
4152 
4153   if (IsTFE) {
4154     Dst1Reg = MI.getOperand(1).getReg();
4155     if (MRI->getType(Dst1Reg) != S32)
4156       return false;
4157 
4158     // TODO: Make sure the TFE operand bit is set.
4159     MI.RemoveOperand(1);
4160 
4161     // Handle the easy case that requires no repack instructions.
4162     if (Ty == S32) {
4163       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4164       return true;
4165     }
4166   }
4167 
4168   // Now figure out how to copy the new result register back into the old
4169   // result.
4170   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4171 
4172   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4173 
4174   if (ResultNumRegs == 1) {
4175     assert(!IsTFE);
4176     ResultRegs[0] = NewResultReg;
4177   } else {
4178     // We have to repack into a new vector of some kind.
4179     for (int I = 0; I != NumDataRegs; ++I)
4180       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4181     B.buildUnmerge(ResultRegs, NewResultReg);
4182 
4183     // Drop the final TFE element to get the data part. The TFE result is
4184     // directly written to the right place already.
4185     if (IsTFE)
4186       ResultRegs.resize(NumDataRegs);
4187   }
4188 
4189   // For an s16 scalar result, we form an s32 result with a truncate regardless
4190   // of packed vs. unpacked.
4191   if (IsD16 && !Ty.isVector()) {
4192     B.buildTrunc(DstReg, ResultRegs[0]);
4193     return true;
4194   }
4195 
4196   // Avoid a build/concat_vector of 1 entry.
4197   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4198     B.buildBitcast(DstReg, ResultRegs[0]);
4199     return true;
4200   }
4201 
4202   assert(Ty.isVector());
4203 
4204   if (IsD16) {
4205     // For packed D16 results with TFE enabled, all the data components are
4206     // S32. Cast back to the expected type.
4207     //
4208     // TODO: We don't really need to load s32 elements. We would only need one
4209     // cast for the TFE result if a multiple of v2s16 was used.
4210     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4211       for (Register &Reg : ResultRegs)
4212         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4213     } else if (ST.hasUnpackedD16VMem()) {
4214       for (Register &Reg : ResultRegs)
4215         Reg = B.buildTrunc(S16, Reg).getReg(0);
4216     }
4217   }
4218 
4219   auto padWithUndef = [&](LLT Ty, int NumElts) {
4220     if (NumElts == 0)
4221       return;
4222     Register Undef = B.buildUndef(Ty).getReg(0);
4223     for (int I = 0; I != NumElts; ++I)
4224       ResultRegs.push_back(Undef);
4225   };
4226 
4227   // Pad out any elements eliminated due to the dmask.
4228   LLT ResTy = MRI->getType(ResultRegs[0]);
4229   if (!ResTy.isVector()) {
4230     padWithUndef(ResTy, NumElts - ResultRegs.size());
4231     B.buildBuildVector(DstReg, ResultRegs);
4232     return true;
4233   }
4234 
4235   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4236   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4237 
4238   // Deal with the one annoying legal case.
4239   const LLT V3S16 = LLT::vector(3, 16);
4240   if (Ty == V3S16) {
4241     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4242     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4243     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4244     return true;
4245   }
4246 
4247   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4248   B.buildConcatVectors(DstReg, ResultRegs);
4249   return true;
4250 }
4251 
4252 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4253   LegalizerHelper &Helper, MachineInstr &MI) const {
4254   MachineIRBuilder &B = Helper.MIRBuilder;
4255   GISelChangeObserver &Observer = Helper.Observer;
4256 
4257   Register Dst = MI.getOperand(0).getReg();
4258   LLT Ty = B.getMRI()->getType(Dst);
4259   unsigned Size = Ty.getSizeInBits();
4260   MachineFunction &MF = B.getMF();
4261 
4262   Observer.changingInstr(MI);
4263 
4264   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4265     Ty = getBitcastRegisterType(Ty);
4266     Helper.bitcastDst(MI, Ty, 0);
4267     Dst = MI.getOperand(0).getReg();
4268     B.setInsertPt(B.getMBB(), MI);
4269   }
4270 
4271   // FIXME: We don't really need this intermediate instruction. The intrinsic
4272   // should be fixed to have a memory operand. Since it's readnone, we're not
4273   // allowed to add one.
4274   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4275   MI.RemoveOperand(1); // Remove intrinsic ID
4276 
4277   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4278   // TODO: Should this use datalayout alignment?
4279   const unsigned MemSize = (Size + 7) / 8;
4280   const Align MemAlign(4);
4281   MachineMemOperand *MMO = MF.getMachineMemOperand(
4282       MachinePointerInfo(),
4283       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4284           MachineMemOperand::MOInvariant,
4285       MemSize, MemAlign);
4286   MI.addMemOperand(MF, MMO);
4287 
4288   // There are no 96-bit result scalar loads, but widening to 128-bit should
4289   // always be legal. We may need to restore this to a 96-bit result if it turns
4290   // out this needs to be converted to a vector load during RegBankSelect.
4291   if (!isPowerOf2_32(Size)) {
4292     if (Ty.isVector())
4293       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4294     else
4295       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4296   }
4297 
4298   Observer.changedInstr(MI);
4299   return true;
4300 }
4301 
4302 // TODO: Move to selection
4303 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4304                                                 MachineRegisterInfo &MRI,
4305                                                 MachineIRBuilder &B) const {
4306   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4307   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4308       !ST.isTrapHandlerEnabled()) {
4309     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4310   } else {
4311     // Pass queue pointer to trap handler as input, and insert trap instruction
4312     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4315     Register LiveIn =
4316       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4317     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4318       return false;
4319 
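         // The HSA trap handler ABI (see the reference above) expects the queue
         // pointer in SGPR0/SGPR1, so copy it there and attach it to S_TRAP as an
         // implicit use.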
4320     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4321     B.buildCopy(SGPR01, LiveIn);
4322     B.buildInstr(AMDGPU::S_TRAP)
4323         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4324         .addReg(SGPR01, RegState::Implicit);
4325   }
4326 
4327   MI.eraseFromParent();
4328   return true;
4329 }
4330 
4331 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4332     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4333   // If this is a non-HSA path or the trap handler is disabled, report a
4334   // warning instead of emitting the trap.
4335   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4336       !ST.isTrapHandlerEnabled()) {
4337     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4338                                      "debugtrap handler not supported",
4339                                      MI.getDebugLoc(), DS_Warning);
4340     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4341     Ctx.diagnose(NoTrap);
4342   } else {
4343     // Insert debug-trap instruction
4344     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4345   }
4346 
4347   MI.eraseFromParent();
4348   return true;
4349 }
4350 
4351 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4352                                             MachineInstr &MI) const {
4353   MachineIRBuilder &B = Helper.MIRBuilder;
4354   MachineRegisterInfo &MRI = *B.getMRI();
4355 
4356   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4357   auto IntrID = MI.getIntrinsicID();
4358   switch (IntrID) {
4359   case Intrinsic::amdgcn_if:
4360   case Intrinsic::amdgcn_else: {
4361     MachineInstr *Br = nullptr;
4362     MachineBasicBlock *UncondBrTarget = nullptr;
4363     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4364       const SIRegisterInfo *TRI
4365         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4366 
4367       Register Def = MI.getOperand(1).getReg();
4368       Register Use = MI.getOperand(3).getReg();
4369 
4370       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4371       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4372       if (IntrID == Intrinsic::amdgcn_if) {
4373         B.buildInstr(AMDGPU::SI_IF)
4374           .addDef(Def)
4375           .addUse(Use)
4376           .addMBB(UncondBrTarget);
4377       } else {
4378         B.buildInstr(AMDGPU::SI_ELSE)
4379           .addDef(Def)
4380           .addUse(Use)
4381           .addMBB(UncondBrTarget)
4382           .addImm(0);
4383       }
4384 
4385       if (Br) {
4386         Br->getOperand(0).setMBB(CondBrTarget);
4387       } else {
4388         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4389         // since we're swapping branch targets it needs to be reinserted.
4390         // FIXME: IRTranslator should probably not do this
4391         B.buildBr(*CondBrTarget);
4392       }
4393 
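           // The mask operands of these pseudos are exec-sized lane masks, so
           // constrain them to the wave mask register class before selection.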
4394       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4395       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4396       MI.eraseFromParent();
4397       BrCond->eraseFromParent();
4398       return true;
4399     }
4400 
4401     return false;
4402   }
4403   case Intrinsic::amdgcn_loop: {
4404     MachineInstr *Br = nullptr;
4405     MachineBasicBlock *UncondBrTarget = nullptr;
4406     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4407       const SIRegisterInfo *TRI
4408         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4409 
4410       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4411       Register Reg = MI.getOperand(2).getReg();
4412 
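           // As with amdgcn_if/else, replace the G_BRCOND with the SI_LOOP pseudo
           // and retarget the branches the same way; the loop mask operand is also
           // a wave mask.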
4413       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4414       B.buildInstr(AMDGPU::SI_LOOP)
4415         .addUse(Reg)
4416         .addMBB(UncondBrTarget);
4417 
4418       if (Br)
4419         Br->getOperand(0).setMBB(CondBrTarget);
4420       else
4421         B.buildBr(*CondBrTarget);
4422 
4423       MI.eraseFromParent();
4424       BrCond->eraseFromParent();
4425       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4426       return true;
4427     }
4428 
4429     return false;
4430   }
4431   case Intrinsic::amdgcn_kernarg_segment_ptr:
4432     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4433       // This only makes sense to call in a kernel, so just lower to null.
4434       B.buildConstant(MI.getOperand(0).getReg(), 0);
4435       MI.eraseFromParent();
4436       return true;
4437     }
4438 
4439     return legalizePreloadedArgIntrin(
4440       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4441   case Intrinsic::amdgcn_implicitarg_ptr:
4442     return legalizeImplicitArgPtr(MI, MRI, B);
4443   case Intrinsic::amdgcn_workitem_id_x:
4444     return legalizePreloadedArgIntrin(MI, MRI, B,
4445                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4446   case Intrinsic::amdgcn_workitem_id_y:
4447     return legalizePreloadedArgIntrin(MI, MRI, B,
4448                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4449   case Intrinsic::amdgcn_workitem_id_z:
4450     return legalizePreloadedArgIntrin(MI, MRI, B,
4451                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4452   case Intrinsic::amdgcn_workgroup_id_x:
4453     return legalizePreloadedArgIntrin(MI, MRI, B,
4454                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4455   case Intrinsic::amdgcn_workgroup_id_y:
4456     return legalizePreloadedArgIntrin(MI, MRI, B,
4457                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4458   case Intrinsic::amdgcn_workgroup_id_z:
4459     return legalizePreloadedArgIntrin(MI, MRI, B,
4460                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4461   case Intrinsic::amdgcn_dispatch_ptr:
4462     return legalizePreloadedArgIntrin(MI, MRI, B,
4463                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4464   case Intrinsic::amdgcn_queue_ptr:
4465     return legalizePreloadedArgIntrin(MI, MRI, B,
4466                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4467   case Intrinsic::amdgcn_implicit_buffer_ptr:
4468     return legalizePreloadedArgIntrin(
4469       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4470   case Intrinsic::amdgcn_dispatch_id:
4471     return legalizePreloadedArgIntrin(MI, MRI, B,
4472                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4473   case Intrinsic::amdgcn_fdiv_fast:
4474     return legalizeFDIVFastIntrin(MI, MRI, B);
4475   case Intrinsic::amdgcn_is_shared:
4476     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4477   case Intrinsic::amdgcn_is_private:
4478     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4479   case Intrinsic::amdgcn_wavefrontsize: {
4480     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4481     MI.eraseFromParent();
4482     return true;
4483   }
4484   case Intrinsic::amdgcn_s_buffer_load:
4485     return legalizeSBufferLoad(Helper, MI);
4486   case Intrinsic::amdgcn_raw_buffer_store:
4487   case Intrinsic::amdgcn_struct_buffer_store:
4488     return legalizeBufferStore(MI, MRI, B, false, false);
4489   case Intrinsic::amdgcn_raw_buffer_store_format:
4490   case Intrinsic::amdgcn_struct_buffer_store_format:
4491     return legalizeBufferStore(MI, MRI, B, false, true);
4492   case Intrinsic::amdgcn_raw_tbuffer_store:
4493   case Intrinsic::amdgcn_struct_tbuffer_store:
4494     return legalizeBufferStore(MI, MRI, B, true, true);
4495   case Intrinsic::amdgcn_raw_buffer_load:
4496   case Intrinsic::amdgcn_struct_buffer_load:
4497     return legalizeBufferLoad(MI, MRI, B, false, false);
4498   case Intrinsic::amdgcn_raw_buffer_load_format:
4499   case Intrinsic::amdgcn_struct_buffer_load_format:
4500     return legalizeBufferLoad(MI, MRI, B, true, false);
4501   case Intrinsic::amdgcn_raw_tbuffer_load:
4502   case Intrinsic::amdgcn_struct_tbuffer_load:
4503     return legalizeBufferLoad(MI, MRI, B, true, true);
4504   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4505   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4506   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4507   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4508   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4509   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4510   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4511   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4512   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4513   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4514   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4515   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4516   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4517   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4518   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4519   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4520   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4521   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4522   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4523   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4524   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4525   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4526   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4527   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4528   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4529   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4530   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4531   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4532     return legalizeBufferAtomic(MI, B, IntrID);
4533   case Intrinsic::amdgcn_atomic_inc:
4534     return legalizeAtomicIncDec(MI, B, true);
4535   case Intrinsic::amdgcn_atomic_dec:
4536     return legalizeAtomicIncDec(MI, B, false);
4537   case Intrinsic::trap:
4538     return legalizeTrapIntrinsic(MI, MRI, B);
4539   case Intrinsic::debugtrap:
4540     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4541   case Intrinsic::amdgcn_rsq_clamp:
4542     return legalizeRsqClampIntrinsic(MI, MRI, B);
4543   case Intrinsic::amdgcn_ds_fadd:
4544   case Intrinsic::amdgcn_ds_fmin:
4545   case Intrinsic::amdgcn_ds_fmax:
4546     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4547   default: {
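         // Image dimension intrinsics get a dedicated legalization path; any other
         // intrinsic reaching here needs no custom legalization.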
4548     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4549             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4550       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4551     return true;
4552   }
4553   }
4554 
4555   return true;
4556 }
4557