1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the number of bits up to the next power of two.
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
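// True for a vector of 16-bit elements with more than two elements.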
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
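// Mutation that grows the vector at TypeIdx by one element.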
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
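// Mutation that splits the vector at TypeIdx into pieces of at most 64 bits.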
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
// Increase the number of vector elements so the total size is a multiple of
// 32 bits.
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
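// Type to use when bitcasting \p Ty into a register-compatible form: a scalar
// for sizes up to 32 bits, otherwise 32-bit elements.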
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
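// True if a value of this size occupies a whole number of 32-bit registers
// and does not exceed the maximum register size.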
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
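// Vector types that map directly onto registers: 32, 64, 128 or 256-bit
// elements, or an even number of 16-bit elements.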
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
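// True for vectors whose element type is s16 or at least 32 bits wide.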
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
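// True for a truncating store where the scalar value is wider than 32 bits
// and wider than the stored memory size.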
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
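// Maximum memory access size in bits the legalizer accepts for the given
// address space; wider accesses are split up.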
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
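// True if the combination of register type, memory size, alignment and
// address space is one the target can handle as-is.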
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
341 
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348 
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
353   const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
356 
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
362 
363 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
364                                          const GCNTargetMachine &TM)
  : ST(ST_) {
366   using namespace TargetOpcode;
367 
368   auto GetAddrSpacePtr = [&TM](unsigned AS) {
369     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
370   };
371 
372   const LLT S1 = LLT::scalar(1);
373   const LLT S16 = LLT::scalar(16);
374   const LLT S32 = LLT::scalar(32);
375   const LLT S64 = LLT::scalar(64);
376   const LLT S128 = LLT::scalar(128);
377   const LLT S256 = LLT::scalar(256);
378   const LLT S512 = LLT::scalar(512);
379   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
380 
381   const LLT V2S16 = LLT::vector(2, 16);
382   const LLT V4S16 = LLT::vector(4, 16);
383 
384   const LLT V2S32 = LLT::vector(2, 32);
385   const LLT V3S32 = LLT::vector(3, 32);
386   const LLT V4S32 = LLT::vector(4, 32);
387   const LLT V5S32 = LLT::vector(5, 32);
388   const LLT V6S32 = LLT::vector(6, 32);
389   const LLT V7S32 = LLT::vector(7, 32);
390   const LLT V8S32 = LLT::vector(8, 32);
391   const LLT V9S32 = LLT::vector(9, 32);
392   const LLT V10S32 = LLT::vector(10, 32);
393   const LLT V11S32 = LLT::vector(11, 32);
394   const LLT V12S32 = LLT::vector(12, 32);
395   const LLT V13S32 = LLT::vector(13, 32);
396   const LLT V14S32 = LLT::vector(14, 32);
397   const LLT V15S32 = LLT::vector(15, 32);
398   const LLT V16S32 = LLT::vector(16, 32);
399   const LLT V32S32 = LLT::vector(32, 32);
400 
401   const LLT V2S64 = LLT::vector(2, 64);
402   const LLT V3S64 = LLT::vector(3, 64);
403   const LLT V4S64 = LLT::vector(4, 64);
404   const LLT V5S64 = LLT::vector(5, 64);
405   const LLT V6S64 = LLT::vector(6, 64);
406   const LLT V7S64 = LLT::vector(7, 64);
407   const LLT V8S64 = LLT::vector(8, 64);
408   const LLT V16S64 = LLT::vector(16, 64);
409 
410   std::initializer_list<LLT> AllS32Vectors =
411     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
412      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
413   std::initializer_list<LLT> AllS64Vectors =
414     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
415 
416   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
417   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
418   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
419   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
420   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
421   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
422   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
423 
424   const LLT CodePtr = FlatPtr;
425 
426   const std::initializer_list<LLT> AddrSpaces64 = {
427     GlobalPtr, ConstantPtr, FlatPtr
428   };
429 
430   const std::initializer_list<LLT> AddrSpaces32 = {
431     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
432   };
433 
434   const std::initializer_list<LLT> FPTypesBase = {
435     S32, S64
436   };
437 
438   const std::initializer_list<LLT> FPTypes16 = {
439     S32, S64, S16
440   };
441 
442   const std::initializer_list<LLT> FPTypesPK16 = {
443     S32, S64, S16, V2S16
444   };
445 
446   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
447 
448   setAction({G_BRCOND, S1}, Legal); // VCC branches
449   setAction({G_BRCOND, S32}, Legal); // SCC branches
450 
451   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
452   // elements for v3s16
453   getActionDefinitionsBuilder(G_PHI)
454     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
455     .legalFor(AllS32Vectors)
456     .legalFor(AllS64Vectors)
457     .legalFor(AddrSpaces64)
458     .legalFor(AddrSpaces32)
459     .legalIf(isPointer(0))
460     .clampScalar(0, S16, S256)
461     .widenScalarToNextPow2(0, 32)
462     .clampMaxNumElements(0, S32, 16)
463     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
464     .scalarize(0);
465 
466   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
467     // Full set of gfx9 features.
468     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
469       .legalFor({S32, S16, V2S16})
470       .clampScalar(0, S16, S32)
471       .clampMaxNumElements(0, S16, 2)
472       .scalarize(0)
473       .widenScalarToNextPow2(0, 32);
474 
475     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
476       .legalFor({S32, S16, V2S16}) // Clamp modifier
477       .minScalarOrElt(0, S16)
478       .clampMaxNumElements(0, S16, 2)
479       .scalarize(0)
480       .widenScalarToNextPow2(0, 32)
481       .lower();
482   } else if (ST.has16BitInsts()) {
483     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
484       .legalFor({S32, S16})
485       .clampScalar(0, S16, S32)
486       .scalarize(0)
487       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
488 
489     // Technically the saturating operations require clamp bit support, but this
490     // was introduced at the same time as 16-bit operations.
491     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
492       .legalFor({S32, S16}) // Clamp modifier
493       .minScalar(0, S16)
494       .scalarize(0)
495       .widenScalarToNextPow2(0, 16)
496       .lower();
497 
498     // We're just lowering this, but it helps get a better result to try to
499     // coerce to the desired type first.
500     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
501       .minScalar(0, S16)
502       .scalarize(0)
503       .lower();
504   } else {
505     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
506       .legalFor({S32})
507       .clampScalar(0, S32, S32)
508       .scalarize(0);
509 
510     if (ST.hasIntClamp()) {
511       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
512         .legalFor({S32}) // Clamp modifier.
513         .scalarize(0)
514         .minScalarOrElt(0, S32)
515         .lower();
516     } else {
517       // Clamp bit support was added in VI, along with 16-bit operations.
518       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
519         .minScalar(0, S32)
520         .scalarize(0)
521         .lower();
522     }
523 
524     // FIXME: DAG expansion gets better results. The widening uses the smaller
525     // range values and goes for the min/max lowering directly.
526     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
527       .minScalar(0, S32)
528       .scalarize(0)
529       .lower();
530   }
531 
532   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
533     .customFor({S32, S64})
534     .clampScalar(0, S32, S64)
535     .widenScalarToNextPow2(0, 32)
536     .scalarize(0);
537 
538   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
539     .legalFor({S32})
540     .clampScalar(0, S32, S32)
541     .scalarize(0);
542 
543   // Report legal for any types we can handle anywhere. For the cases only legal
544   // on the SALU, RegBankSelect will be able to re-legalize.
545   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
546     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
547     .clampScalar(0, S32, S64)
548     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
549     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
550     .widenScalarToNextPow2(0)
551     .scalarize(0);
552 
553   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
554                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
555     .legalFor({{S32, S1}, {S32, S32}})
556     .minScalar(0, S32)
557     // TODO: .scalarize(0)
558     .lower();
559 
560   getActionDefinitionsBuilder(G_BITCAST)
561     // Don't worry about the size constraint.
562     .legalIf(all(isRegisterType(0), isRegisterType(1)))
563     .lower();
564 
565 
566   getActionDefinitionsBuilder(G_CONSTANT)
567     .legalFor({S1, S32, S64, S16, GlobalPtr,
568                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
569     .legalIf(isPointer(0))
570     .clampScalar(0, S32, S64)
571     .widenScalarToNextPow2(0);
572 
573   getActionDefinitionsBuilder(G_FCONSTANT)
574     .legalFor({S32, S64, S16})
575     .clampScalar(0, S16, S64);
576 
577   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
578       .legalIf(isRegisterType(0))
579       // s1 and s16 are special cases because they have legal operations on
580       // them, but don't really occupy registers in the normal way.
581       .legalFor({S1, S16})
582       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
583       .clampScalarOrElt(0, S32, MaxScalar)
584       .widenScalarToNextPow2(0, 32)
585       .clampMaxNumElements(0, S32, 16);
586 
587   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
588 
589   // If the amount is divergent, we have to do a wave reduction to get the
590   // maximum value, so this is expanded during RegBankSelect.
591   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
592     .legalFor({{PrivatePtr, S32}});
593 
594   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
595     .customIf(typeIsNot(0, PrivatePtr));
596 
597   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
598 
599   auto &FPOpActions = getActionDefinitionsBuilder(
600     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
601     .legalFor({S32, S64});
602   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
603     .customFor({S32, S64});
604   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
605     .customFor({S32, S64});
606 
607   if (ST.has16BitInsts()) {
608     if (ST.hasVOP3PInsts())
609       FPOpActions.legalFor({S16, V2S16});
610     else
611       FPOpActions.legalFor({S16});
612 
613     TrigActions.customFor({S16});
614     FDIVActions.customFor({S16});
615   }
616 
617   auto &MinNumMaxNum = getActionDefinitionsBuilder({
618       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
619 
620   if (ST.hasVOP3PInsts()) {
621     MinNumMaxNum.customFor(FPTypesPK16)
622       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
623       .clampMaxNumElements(0, S16, 2)
624       .clampScalar(0, S16, S64)
625       .scalarize(0);
626   } else if (ST.has16BitInsts()) {
627     MinNumMaxNum.customFor(FPTypes16)
628       .clampScalar(0, S16, S64)
629       .scalarize(0);
630   } else {
631     MinNumMaxNum.customFor(FPTypesBase)
632       .clampScalar(0, S32, S64)
633       .scalarize(0);
634   }
635 
636   if (ST.hasVOP3PInsts())
637     FPOpActions.clampMaxNumElements(0, S16, 2);
638 
639   FPOpActions
640     .scalarize(0)
641     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
642 
643   TrigActions
644     .scalarize(0)
645     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
646 
647   FDIVActions
648     .scalarize(0)
649     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
650 
651   getActionDefinitionsBuilder({G_FNEG, G_FABS})
652     .legalFor(FPTypesPK16)
653     .clampMaxNumElements(0, S16, 2)
654     .scalarize(0)
655     .clampScalar(0, S16, S64);
656 
657   if (ST.has16BitInsts()) {
658     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
659       .legalFor({S32, S64, S16})
660       .scalarize(0)
661       .clampScalar(0, S16, S64);
662   } else {
663     getActionDefinitionsBuilder(G_FSQRT)
664       .legalFor({S32, S64})
665       .scalarize(0)
666       .clampScalar(0, S32, S64);
667 
668     if (ST.hasFractBug()) {
669       getActionDefinitionsBuilder(G_FFLOOR)
670         .customFor({S64})
671         .legalFor({S32, S64})
672         .scalarize(0)
673         .clampScalar(0, S32, S64);
674     } else {
675       getActionDefinitionsBuilder(G_FFLOOR)
676         .legalFor({S32, S64})
677         .scalarize(0)
678         .clampScalar(0, S32, S64);
679     }
680   }
681 
682   getActionDefinitionsBuilder(G_FPTRUNC)
683     .legalFor({{S32, S64}, {S16, S32}})
684     .scalarize(0)
685     .lower();
686 
687   getActionDefinitionsBuilder(G_FPEXT)
688     .legalFor({{S64, S32}, {S32, S16}})
689     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
690     .scalarize(0);
691 
692   getActionDefinitionsBuilder(G_FSUB)
693       // Use actual fsub instruction
694       .legalFor({S32})
695       // Must use fadd + fneg
696       .lowerFor({S64, S16, V2S16})
697       .scalarize(0)
698       .clampScalar(0, S32, S64);
699 
700   // Whether this is legal depends on the floating point mode for the function.
701   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
702   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
703     FMad.customFor({S32, S16});
704   else if (ST.hasMadMacF32Insts())
705     FMad.customFor({S32});
706   else if (ST.hasMadF16())
707     FMad.customFor({S16});
708   FMad.scalarize(0)
709       .lower();
710 
711   auto &FRem = getActionDefinitionsBuilder(G_FREM);
712   if (ST.has16BitInsts()) {
713     FRem.customFor({S16, S32, S64});
714   } else {
715     FRem.minScalar(0, S32)
716         .customFor({S32, S64});
717   }
718   FRem.scalarize(0);
719 
720   // TODO: Do we need to clamp maximum bitwidth?
721   getActionDefinitionsBuilder(G_TRUNC)
722     .legalIf(isScalar(0))
723     .legalFor({{V2S16, V2S32}})
724     .clampMaxNumElements(0, S16, 2)
725     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
726     // situations (like an invalid implicit use), we don't want to infinite loop
727     // in the legalizer.
728     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
729     .alwaysLegal();
730 
731   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
732     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
733                {S32, S1}, {S64, S1}, {S16, S1}})
734     .scalarize(0)
735     .clampScalar(0, S32, S64)
736     .widenScalarToNextPow2(1, 32);
737 
738   // TODO: Split s1->s64 during regbankselect for VALU.
739   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
740     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
741     .lowerFor({{S32, S64}})
742     .lowerIf(typeIs(1, S1))
743     .customFor({{S64, S64}});
744   if (ST.has16BitInsts())
745     IToFP.legalFor({{S16, S16}});
746   IToFP.clampScalar(1, S32, S64)
747        .minScalar(0, S32)
748        .scalarize(0)
749        .widenScalarToNextPow2(1);
750 
751   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
752     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
753     .customFor({{S64, S64}})
754     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
755   if (ST.has16BitInsts())
756     FPToI.legalFor({{S16, S16}});
757   else
758     FPToI.minScalar(1, S32);
759 
760   FPToI.minScalar(0, S32)
761        .scalarize(0)
762        .lower();
763 
764   // Lower roundeven into G_FRINT
765   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
766     .scalarize(0)
767     .lower();
768 
769   if (ST.has16BitInsts()) {
770     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
771       .legalFor({S16, S32, S64})
772       .clampScalar(0, S16, S64)
773       .scalarize(0);
774   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
775     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
776       .legalFor({S32, S64})
777       .clampScalar(0, S32, S64)
778       .scalarize(0);
779   } else {
780     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
781       .legalFor({S32})
782       .customFor({S64})
783       .clampScalar(0, S32, S64)
784       .scalarize(0);
785   }
786 
787   getActionDefinitionsBuilder(G_PTR_ADD)
788     .legalIf(all(isPointer(0), sameSize(0, 1)))
789     .scalarize(0)
790     .scalarSameSizeAs(1, 0);
791 
792   getActionDefinitionsBuilder(G_PTRMASK)
793     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
794     .scalarSameSizeAs(1, 0)
795     .scalarize(0);
796 
797   auto &CmpBuilder =
798     getActionDefinitionsBuilder(G_ICMP)
799     // The compare output type differs based on the register bank of the output,
800     // so make both s1 and s32 legal.
801     //
802     // Scalar compares producing output in scc will be promoted to s32, as that
803     // is the allocatable register type that will be needed for the copy from
804     // scc. This will be promoted during RegBankSelect, and we assume something
805     // before that won't try to use s32 result types.
806     //
807     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
808     // bank.
809     .legalForCartesianProduct(
810       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
811     .legalForCartesianProduct(
812       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
813   if (ST.has16BitInsts()) {
814     CmpBuilder.legalFor({{S1, S16}});
815   }
816 
817   CmpBuilder
818     .widenScalarToNextPow2(1)
819     .clampScalar(1, S32, S64)
820     .scalarize(0)
821     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
822 
823   getActionDefinitionsBuilder(G_FCMP)
824     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
825     .widenScalarToNextPow2(1)
826     .clampScalar(1, S32, S64)
827     .scalarize(0);
828 
829   // FIXME: fpow has a selection pattern that should move to custom lowering.
830   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
831   if (ST.has16BitInsts())
832     Exp2Ops.legalFor({S32, S16});
833   else
834     Exp2Ops.legalFor({S32});
835   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
836   Exp2Ops.scalarize(0);
837 
838   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
839   if (ST.has16BitInsts())
840     ExpOps.customFor({{S32}, {S16}});
841   else
842     ExpOps.customFor({S32});
843   ExpOps.clampScalar(0, MinScalarFPTy, S32)
844         .scalarize(0);
845 
846   getActionDefinitionsBuilder(G_FPOWI)
847     .clampScalar(0, MinScalarFPTy, S32)
848     .lower();
849 
850   // The 64-bit versions produce 32-bit results, but only on the SALU.
851   getActionDefinitionsBuilder(G_CTPOP)
852     .legalFor({{S32, S32}, {S32, S64}})
853     .clampScalar(0, S32, S32)
854     .clampScalar(1, S32, S64)
855     .scalarize(0)
856     .widenScalarToNextPow2(0, 32)
857     .widenScalarToNextPow2(1, 32);
858 
859   // The hardware instructions return a different result on 0 than the generic
860   // instructions expect. The hardware produces -1, but these produce the
861   // bitwidth.
862   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
863     .scalarize(0)
864     .clampScalar(0, S32, S32)
865     .clampScalar(1, S32, S64)
866     .widenScalarToNextPow2(0, 32)
867     .widenScalarToNextPow2(1, 32)
868     .lower();
869 
870   // The 64-bit versions produce 32-bit results, but only on the SALU.
871   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
872     .legalFor({{S32, S32}, {S32, S64}})
873     .clampScalar(0, S32, S32)
874     .clampScalar(1, S32, S64)
875     .scalarize(0)
876     .widenScalarToNextPow2(0, 32)
877     .widenScalarToNextPow2(1, 32);
878 
879   getActionDefinitionsBuilder(G_BITREVERSE)
880     .legalFor({S32})
881     .clampScalar(0, S32, S32)
882     .scalarize(0);
883 
884   if (ST.has16BitInsts()) {
885     getActionDefinitionsBuilder(G_BSWAP)
886       .legalFor({S16, S32, V2S16})
887       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
889       // narrowScalar limitation.
890       .widenScalarToNextPow2(0)
891       .clampScalar(0, S16, S32)
892       .scalarize(0);
893 
894     if (ST.hasVOP3PInsts()) {
895       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
896         .legalFor({S32, S16, V2S16})
897         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
898         .clampMaxNumElements(0, S16, 2)
899         .minScalar(0, S16)
900         .widenScalarToNextPow2(0)
901         .scalarize(0)
902         .lower();
903     } else {
904       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
905         .legalFor({S32, S16})
906         .widenScalarToNextPow2(0)
907         .minScalar(0, S16)
908         .scalarize(0)
909         .lower();
910     }
911   } else {
912     // TODO: Should have same legality without v_perm_b32
913     getActionDefinitionsBuilder(G_BSWAP)
914       .legalFor({S32})
915       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
917       // narrowScalar limitation.
918       .widenScalarToNextPow2(0)
919       .maxScalar(0, S32)
920       .scalarize(0)
921       .lower();
922 
923     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
924       .legalFor({S32})
925       .minScalar(0, S32)
926       .widenScalarToNextPow2(0)
927       .scalarize(0)
928       .lower();
929   }
930 
931   getActionDefinitionsBuilder(G_INTTOPTR)
932     // List the common cases
933     .legalForCartesianProduct(AddrSpaces64, {S64})
934     .legalForCartesianProduct(AddrSpaces32, {S32})
935     .scalarize(0)
936     // Accept any address space as long as the size matches
937     .legalIf(sameSize(0, 1))
938     .widenScalarIf(smallerThan(1, 0),
939       [](const LegalityQuery &Query) {
940         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
941       })
942     .narrowScalarIf(largerThan(1, 0),
943       [](const LegalityQuery &Query) {
944         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
945       });
946 
947   getActionDefinitionsBuilder(G_PTRTOINT)
948     // List the common cases
949     .legalForCartesianProduct(AddrSpaces64, {S64})
950     .legalForCartesianProduct(AddrSpaces32, {S32})
951     .scalarize(0)
952     // Accept any address space as long as the size matches
953     .legalIf(sameSize(0, 1))
954     .widenScalarIf(smallerThan(0, 1),
955       [](const LegalityQuery &Query) {
956         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
957       })
958     .narrowScalarIf(
959       largerThan(0, 1),
960       [](const LegalityQuery &Query) {
961         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
962       });
963 
964   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
965     .scalarize(0)
966     .custom();
967 
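  // Whether a memory access has to be split: vector extloads, accesses wider
  // than the address space allows, unsupported register counts, or
  // insufficient alignment.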
968   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
969                                     bool IsLoad) -> bool {
970     const LLT DstTy = Query.Types[0];
971 
972     // Split vector extloads.
973     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
974     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
975 
976     if (MemSize < DstTy.getSizeInBits())
977       MemSize = std::max(MemSize, AlignBits);
978 
979     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
980       return true;
981 
982     const LLT PtrTy = Query.Types[1];
983     unsigned AS = PtrTy.getAddressSpace();
984     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
985       return true;
986 
987     // Catch weird sized loads that don't evenly divide into the access sizes
988     // TODO: May be able to widen depending on alignment etc.
989     unsigned NumRegs = (MemSize + 31) / 32;
990     if (NumRegs == 3) {
991       if (!ST.hasDwordx3LoadStores())
992         return true;
993     } else {
994       // If the alignment allows, these should have been widened.
995       if (!isPowerOf2_32(NumRegs))
996         return true;
997     }
998 
999     if (AlignBits < MemSize) {
1000       const SITargetLowering *TLI = ST.getTargetLowering();
1001       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1002                                                       Align(AlignBits / 8));
1003     }
1004 
1005     return false;
1006   };
1007 
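  // Whether a load with a non-power-of-2 result type should be widened to the
  // next power of 2, based on the access alignment and address space limits.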
1008   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
1009                                          unsigned Opc) -> bool {
1010     unsigned Size = Query.Types[0].getSizeInBits();
1011     if (isPowerOf2_32(Size))
1012       return false;
1013 
1014     if (Size == 96 && ST.hasDwordx3LoadStores())
1015       return false;
1016 
1017     unsigned AddrSpace = Query.Types[1].getAddressSpace();
1018     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
1019       return false;
1020 
1021     unsigned Align = Query.MMODescrs[0].AlignInBits;
1022     unsigned RoundedSize = NextPowerOf2(Size);
1023     return (Align >= RoundedSize);
1024   };
1025 
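  // Alignment requirements (in bits) for global and constant address space
  // accesses; 0 means unaligned buffer access is available.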
1026   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
1027   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
1028   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
1029 
1030   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1031   // LDS
1032   // TODO: Unsupported flat for SI.
1033 
1034   for (unsigned Op : {G_LOAD, G_STORE}) {
1035     const bool IsStore = Op == G_STORE;
1036 
1037     auto &Actions = getActionDefinitionsBuilder(Op);
1038     // Explicitly list some common cases.
1039     // TODO: Does this help compile time at all?
1040     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1041                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1042                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1043                                       {S64, GlobalPtr, 64, GlobalAlign32},
1044                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1045                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1046                                       {S32, GlobalPtr, 8, GlobalAlign8},
1047                                       {S32, GlobalPtr, 16, GlobalAlign16},
1048 
1049                                       {S32, LocalPtr, 32, 32},
1050                                       {S64, LocalPtr, 64, 32},
1051                                       {V2S32, LocalPtr, 64, 32},
1052                                       {S32, LocalPtr, 8, 8},
1053                                       {S32, LocalPtr, 16, 16},
1054                                       {V2S16, LocalPtr, 32, 32},
1055 
1056                                       {S32, PrivatePtr, 32, 32},
1057                                       {S32, PrivatePtr, 8, 8},
1058                                       {S32, PrivatePtr, 16, 16},
1059                                       {V2S16, PrivatePtr, 32, 32},
1060 
1061                                       {S32, ConstantPtr, 32, GlobalAlign32},
1062                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1063                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1064                                       {S64, ConstantPtr, 64, GlobalAlign32},
1065                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1066     Actions.legalIf(
1067       [=](const LegalityQuery &Query) -> bool {
1068         return isLoadStoreLegal(ST, Query, Op);
1069       });
1070 
1071     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1072     // 64-bits.
1073     //
1074     // TODO: Should generalize bitcast action into coerce, which will also cover
1075     // inserting addrspacecasts.
1076     Actions.customIf(typeIs(1, Constant32Ptr));
1077 
1078     // Turn any illegal element vectors into something easier to deal
1079     // with. These will ultimately produce 32-bit scalar shifts to extract the
1080     // parts anyway.
1081     //
1082     // For odd 16-bit element vectors, prefer to split those into pieces with
1083     // 16-bit vector parts.
1084     Actions.bitcastIf(
1085       [=](const LegalityQuery &Query) -> bool {
1086         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1087                                           Query.MMODescrs[0].SizeInBits);
1088       }, bitcastToRegisterType(0));
1089 
1090     Actions
1092         // Widen suitably aligned loads by loading extra elements.
1093         .moreElementsIf([=](const LegalityQuery &Query) {
1094             const LLT Ty = Query.Types[0];
1095             return Op == G_LOAD && Ty.isVector() &&
1096                    shouldWidenLoadResult(Query, Op);
1097           }, moreElementsToNextPow2(0))
1098         .widenScalarIf([=](const LegalityQuery &Query) {
1099             const LLT Ty = Query.Types[0];
1100             return Op == G_LOAD && !Ty.isVector() &&
1101                    shouldWidenLoadResult(Query, Op);
1102           }, widenScalarOrEltToNextPow2(0))
1103         .narrowScalarIf(
1104             [=](const LegalityQuery &Query) -> bool {
1105               return !Query.Types[0].isVector() &&
1106                      needToSplitMemOp(Query, Op == G_LOAD);
1107             },
1108             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1109               const LLT DstTy = Query.Types[0];
1110               const LLT PtrTy = Query.Types[1];
1111 
1112               const unsigned DstSize = DstTy.getSizeInBits();
1113               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1114 
1115               // Split extloads.
1116               if (DstSize > MemSize)
1117                 return std::make_pair(0, LLT::scalar(MemSize));
1118 
1119               if (!isPowerOf2_32(DstSize)) {
1120                 // We're probably decomposing an odd sized store. Try to split
1121                 // to the widest type. TODO: Account for alignment. As-is it
1122                 // should be OK, since the new parts will be further legalized.
1123                 unsigned FloorSize = PowerOf2Floor(DstSize);
1124                 return std::make_pair(0, LLT::scalar(FloorSize));
1125               }
1126 
1127               if (DstSize > 32 && (DstSize % 32 != 0)) {
1128                 // FIXME: Need a way to specify non-extload of larger size if
1129                 // suitably aligned.
1130                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1131               }
1132 
1133               unsigned MaxSize = maxSizeForAddrSpace(ST,
1134                                                      PtrTy.getAddressSpace(),
1135                                                      Op == G_LOAD);
1136               if (MemSize > MaxSize)
1137                 return std::make_pair(0, LLT::scalar(MaxSize));
1138 
1139               unsigned Align = Query.MMODescrs[0].AlignInBits;
1140               return std::make_pair(0, LLT::scalar(Align));
1141             })
1142         .fewerElementsIf(
1143             [=](const LegalityQuery &Query) -> bool {
1144               return Query.Types[0].isVector() &&
1145                      needToSplitMemOp(Query, Op == G_LOAD);
1146             },
1147             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1148               const LLT DstTy = Query.Types[0];
1149               const LLT PtrTy = Query.Types[1];
1150 
1151               LLT EltTy = DstTy.getElementType();
1152               unsigned MaxSize = maxSizeForAddrSpace(ST,
1153                                                      PtrTy.getAddressSpace(),
1154                                                      Op == G_LOAD);
1155 
1156               // FIXME: Handle widened to power of 2 results better. This ends
1157               // up scalarizing.
1158               // FIXME: 3 element stores scalarized on SI
1159 
1160               // Split if it's too large for the address space.
1161               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1162                 unsigned NumElts = DstTy.getNumElements();
1163                 unsigned EltSize = EltTy.getSizeInBits();
1164 
1165                 if (MaxSize % EltSize == 0) {
1166                   return std::make_pair(
1167                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1168                 }
1169 
1170                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1171 
1172                 // FIXME: Refine when odd breakdowns handled
1173                 // The scalars will need to be re-legalized.
1174                 if (NumPieces == 1 || NumPieces >= NumElts ||
1175                     NumElts % NumPieces != 0)
1176                   return std::make_pair(0, EltTy);
1177 
1178                 return std::make_pair(0,
1179                                       LLT::vector(NumElts / NumPieces, EltTy));
1180               }
1181 
1182               // FIXME: We could probably handle weird extending loads better.
1183               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1184               if (DstTy.getSizeInBits() > MemSize)
1185                 return std::make_pair(0, EltTy);
1186 
1187               unsigned EltSize = EltTy.getSizeInBits();
1188               unsigned DstSize = DstTy.getSizeInBits();
1189               if (!isPowerOf2_32(DstSize)) {
1190                 // We're probably decomposing an odd sized store. Try to split
1191                 // to the widest type. TODO: Account for alignment. As-is it
1192                 // should be OK, since the new parts will be further legalized.
1193                 unsigned FloorSize = PowerOf2Floor(DstSize);
1194                 return std::make_pair(
1195                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1196               }
1197 
1198               // Need to split because of alignment.
1199               unsigned Align = Query.MMODescrs[0].AlignInBits;
1200               if (EltSize > Align &&
1201                   (EltSize / Align < DstTy.getNumElements())) {
1202                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1203               }
1204 
1205               // May need relegalization for the scalars.
1206               return std::make_pair(0, EltTy);
1207             })
1208         .minScalar(0, S32);
1209 
1210     if (IsStore)
1211       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1212 
1213     // TODO: Need a bitcast lower option?
1214     Actions
1215         .widenScalarToNextPow2(0)
1216         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1217   }
1218 
1219   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1220                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1221                                                   {S32, GlobalPtr, 16, 2 * 8},
1222                                                   {S32, LocalPtr, 8, 8},
1223                                                   {S32, LocalPtr, 16, 16},
1224                                                   {S32, PrivatePtr, 8, 8},
1225                                                   {S32, PrivatePtr, 16, 16},
1226                                                   {S32, ConstantPtr, 8, 8},
1227                                                   {S32, ConstantPtr, 16, 2 * 8}});
1228   if (ST.hasFlatAddressSpace()) {
1229     ExtLoads.legalForTypesWithMemDesc(
1230         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1231   }
1232 
1233   ExtLoads.clampScalar(0, S32, S32)
1234           .widenScalarToNextPow2(0)
1235           .unsupportedIfMemSizeNotPow2()
1236           .lower();
1237 
1238   auto &Atomics = getActionDefinitionsBuilder(
1239     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1240      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1241      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1242      G_ATOMICRMW_UMIN})
1243     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1244                {S64, GlobalPtr}, {S64, LocalPtr},
1245                {S32, RegionPtr}, {S64, RegionPtr}});
1246   if (ST.hasFlatAddressSpace()) {
1247     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1248   }
1249 
1250   if (ST.hasLDSFPAtomics()) {
1251     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1252       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1253   }
1254 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1257   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1258     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1259                 {S32, FlatPtr}, {S64, FlatPtr}})
1260     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1261                {S32, RegionPtr}, {S64, RegionPtr}});
1262   // TODO: Pointer types, any 32-bit or 64-bit vector
1263 
1264   // Condition should be s32 for scalar, s1 for vector.
1265   getActionDefinitionsBuilder(G_SELECT)
1266     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1267           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1268           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1269     .clampScalar(0, S16, S64)
1270     .scalarize(1)
1271     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1272     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1273     .clampMaxNumElements(0, S32, 2)
1274     .clampMaxNumElements(0, LocalPtr, 2)
1275     .clampMaxNumElements(0, PrivatePtr, 2)
1276     .scalarize(0)
1277     .widenScalarToNextPow2(0)
1278     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1279 
1280   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1281   // be more flexible with the shift amount type.
1282   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1283     .legalFor({{S32, S32}, {S64, S32}});
1284   if (ST.has16BitInsts()) {
1285     if (ST.hasVOP3PInsts()) {
1286       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1287             .clampMaxNumElements(0, S16, 2);
1288     } else
1289       Shifts.legalFor({{S16, S16}});
1290 
1291     // TODO: Support 16-bit shift amounts for all types
1292     Shifts.widenScalarIf(
1293       [=](const LegalityQuery &Query) {
1294         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1295         // 32-bit amount.
1296         const LLT ValTy = Query.Types[0];
1297         const LLT AmountTy = Query.Types[1];
1298         return ValTy.getSizeInBits() <= 16 &&
1299                AmountTy.getSizeInBits() < 16;
1300       }, changeTo(1, S16));
1301     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1302     Shifts.clampScalar(1, S32, S32);
1303     Shifts.clampScalar(0, S16, S64);
1304     Shifts.widenScalarToNextPow2(0, 16);
1305 
1306     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1307       .minScalar(0, S16)
1308       .scalarize(0)
1309       .lower();
1310   } else {
1311     // Make sure we legalize the shift amount type first, as the general
1312     // expansion for the shifted type will produce much worse code if it hasn't
1313     // been truncated already.
1314     Shifts.clampScalar(1, S32, S32);
1315     Shifts.clampScalar(0, S32, S64);
1316     Shifts.widenScalarToNextPow2(0, 32);
1317 
1318     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1319       .minScalar(0, S32)
1320       .scalarize(0)
1321       .lower();
1322   }
1323   Shifts.scalarize(0);
1324 
1325   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1326     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1327     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1328     unsigned IdxTypeIdx = 2;
1329 
1330     getActionDefinitionsBuilder(Op)
1331       .customIf([=](const LegalityQuery &Query) {
1332           const LLT EltTy = Query.Types[EltTypeIdx];
1333           const LLT VecTy = Query.Types[VecTypeIdx];
1334           const LLT IdxTy = Query.Types[IdxTypeIdx];
1335           const unsigned EltSize = EltTy.getSizeInBits();
1336           return (EltSize == 32 || EltSize == 64) &&
1337                   VecTy.getSizeInBits() % 32 == 0 &&
1338                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1339                   IdxTy.getSizeInBits() == 32;
1340         })
1341       .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
1342                  bitcastToVectorElement32(1))
1343       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1344       .bitcastIf(
1345         all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
1346         [=](const LegalityQuery &Query) {
1347           // For > 64-bit element types, try to turn this into a 64-bit
1348           // element vector since we may be able to do better indexing
1349           // if this is scalar. If not, fall back to 32.
1350           const LLT EltTy = Query.Types[EltTypeIdx];
1351           const LLT VecTy = Query.Types[VecTypeIdx];
1352           const unsigned DstEltSize = EltTy.getSizeInBits();
1353           const unsigned VecSize = VecTy.getSizeInBits();
1354 
1355           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1356           return std::make_pair(
1357             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1358         })
1359       .clampScalar(EltTypeIdx, S32, S64)
1360       .clampScalar(VecTypeIdx, S32, S64)
1361       .clampScalar(IdxTypeIdx, S32, S32)
1362       .clampMaxNumElements(1, S32, 32)
1363       // TODO: Clamp elements for 64-bit vectors?
1364       // It should only be necessary with variable indexes.
1365       // As a last resort, lower to the stack
1366       .lower();
1367   }
1368 
1369   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1370     .unsupportedIf([=](const LegalityQuery &Query) {
1371         const LLT &EltTy = Query.Types[1].getElementType();
1372         return Query.Types[0] != EltTy;
1373       });
1374 
1375   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1376     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1377     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1378 
1379     // FIXME: Doesn't handle extract of illegal sizes.
1380     getActionDefinitionsBuilder(Op)
1381       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1382       // FIXME: Multiples of 16 should not be legal.
1383       .legalIf([=](const LegalityQuery &Query) {
1384           const LLT BigTy = Query.Types[BigTyIdx];
1385           const LLT LitTy = Query.Types[LitTyIdx];
1386           return (BigTy.getSizeInBits() % 32 == 0) &&
1387                  (LitTy.getSizeInBits() % 16 == 0);
1388         })
1389       .widenScalarIf(
1390         [=](const LegalityQuery &Query) {
1391           const LLT BigTy = Query.Types[BigTyIdx];
1392           return (BigTy.getScalarSizeInBits() < 16);
1393         },
1394         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1395       .widenScalarIf(
1396         [=](const LegalityQuery &Query) {
1397           const LLT LitTy = Query.Types[LitTyIdx];
1398           return (LitTy.getScalarSizeInBits() < 16);
1399         },
1400         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1401       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1402       .widenScalarToNextPow2(BigTyIdx, 32);
1403 
1404   }
1405 
1406   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1407     .legalForCartesianProduct(AllS32Vectors, {S32})
1408     .legalForCartesianProduct(AllS64Vectors, {S64})
1409     .clampNumElements(0, V16S32, V32S32)
1410     .clampNumElements(0, V2S64, V16S64)
1411     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1412 
1413   if (ST.hasScalarPackInsts()) {
1414     BuildVector
1415       // FIXME: Should probably widen s1 vectors straight to s32
1416       .minScalarOrElt(0, S16)
1417       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1418       .minScalar(1, S32);
1419 
1420     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1421       .legalFor({V2S16, S32})
1422       .lower();
1423     BuildVector.minScalarOrElt(0, S32);
1424   } else {
1425     BuildVector.customFor({V2S16, S16});
1426     BuildVector.minScalarOrElt(0, S32);
1427 
1428     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1429       .customFor({V2S16, S32})
1430       .lower();
1431   }
1432 
1433   BuildVector.legalIf(isRegisterType(0));
1434 
1435   // FIXME: Clamp maximum size
1436   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1437     .legalIf(isRegisterType(0));
1438 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1441   if (ST.hasVOP3PInsts()) {
1442     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1443       .customFor({V2S16, V2S16})
1444       .lower();
1445   } else
1446     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1447 
1448   // Merge/Unmerge
1449   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1450     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1451     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1452 
1453     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1454       const LLT Ty = Query.Types[TypeIdx];
1455       if (Ty.isVector()) {
1456         const LLT &EltTy = Ty.getElementType();
1457         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1458           return true;
1459         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1460           return true;
1461       }
1462       return false;
1463     };
1464 
1465     auto &Builder = getActionDefinitionsBuilder(Op)
1466       .lowerFor({{S16, V2S16}})
1467       .lowerIf([=](const LegalityQuery &Query) {
1468           const LLT BigTy = Query.Types[BigTyIdx];
1469           return BigTy.getSizeInBits() == 32;
1470         })
1471       // Try to widen to s16 first for small types.
1472       // TODO: Only do this on targets with legal s16 shifts
1473       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1474       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1475       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1476       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1477                            elementTypeIs(1, S16)),
1478                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1482       .clampScalar(LitTyIdx, S32, S512)
1483       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1484       // Break up vectors with weird elements into scalars
1485       .fewerElementsIf(
1486         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1487         scalarize(0))
1488       .fewerElementsIf(
1489         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1490         scalarize(1))
1491       .clampScalar(BigTyIdx, S32, MaxScalar);
1492 
1493     if (Op == G_MERGE_VALUES) {
1494       Builder.widenScalarIf(
1495         // TODO: Use 16-bit shifts if legal for 8-bit values?
1496         [=](const LegalityQuery &Query) {
1497           const LLT Ty = Query.Types[LitTyIdx];
1498           return Ty.getSizeInBits() < 32;
1499         },
1500         changeTo(LitTyIdx, S32));
1501     }
1502 
1503     Builder.widenScalarIf(
1504       [=](const LegalityQuery &Query) {
1505         const LLT Ty = Query.Types[BigTyIdx];
1506         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1507           Ty.getSizeInBits() % 16 != 0;
1508       },
1509       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 when over 128,
        // whichever is smaller.
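        // (For example, s65 widens to s128, while s258 widens to s320 rather
        // than s512.)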
1512         const LLT &Ty = Query.Types[BigTyIdx];
1513         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1514         if (NewSizeInBits >= 256) {
1515           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1516           if (RoundedTo < NewSizeInBits)
1517             NewSizeInBits = RoundedTo;
1518         }
1519         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1520       })
1521       .legalIf([=](const LegalityQuery &Query) {
1522           const LLT &BigTy = Query.Types[BigTyIdx];
1523           const LLT &LitTy = Query.Types[LitTyIdx];
1524 
1525           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1526             return false;
1527           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1528             return false;
1529 
1530           return BigTy.getSizeInBits() % 16 == 0 &&
1531                  LitTy.getSizeInBits() % 16 == 0 &&
1532                  BigTy.getSizeInBits() <= MaxRegisterSize;
1533         })
1534       // Any vectors left are the wrong size. Scalarize them.
1535       .scalarize(0)
1536       .scalarize(1);
1537   }
1538 
1539   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1540   // RegBankSelect.
1541   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1542     .legalFor({{S32}, {S64}});
1543 
1544   if (ST.hasVOP3PInsts()) {
1545     SextInReg.lowerFor({{V2S16}})
1546       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1547       // get more vector shift opportunities, since we'll get those when
1548       // expanded.
1549       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1550   } else if (ST.has16BitInsts()) {
1551     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1552   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1555     SextInReg.lowerFor({{S32}, {S64}});
1556   }
1557 
1558   SextInReg
1559     .scalarize(0)
1560     .clampScalar(0, S32, S64)
1561     .lower();
1562 
1563   getActionDefinitionsBuilder(G_FSHR)
1564     .legalFor({{S32, S32}})
1565     .scalarize(0)
1566     .lower();
1567 
1568   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1569     .legalFor({S64});
1570 
1571   getActionDefinitionsBuilder(G_FENCE)
1572     .alwaysLegal();
1573 
1574   getActionDefinitionsBuilder({
1575       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1576       G_FCOPYSIGN,
1577 
1578       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1579       G_ATOMICRMW_NAND,
1580       G_ATOMICRMW_FSUB,
1581       G_READ_REGISTER,
1582       G_WRITE_REGISTER,
1583 
1584       G_SADDO, G_SSUBO,
1585 
      // TODO: Implement
1587       G_FMINIMUM, G_FMAXIMUM,
1588       G_FSHL
1589     }).lower();
1590 
1591   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1592         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1593         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1594     .unsupported();
1595 
1596   computeTables();
1597   verify(*ST.getInstrInfo());
1598 }
1599 
1600 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1601                                          MachineInstr &MI) const {
1602   MachineIRBuilder &B = Helper.MIRBuilder;
1603   MachineRegisterInfo &MRI = *B.getMRI();
1604   GISelChangeObserver &Observer = Helper.Observer;
1605 
1606   switch (MI.getOpcode()) {
1607   case TargetOpcode::G_ADDRSPACE_CAST:
1608     return legalizeAddrSpaceCast(MI, MRI, B);
1609   case TargetOpcode::G_FRINT:
1610     return legalizeFrint(MI, MRI, B);
1611   case TargetOpcode::G_FCEIL:
1612     return legalizeFceil(MI, MRI, B);
1613   case TargetOpcode::G_FREM:
1614     return legalizeFrem(MI, MRI, B);
1615   case TargetOpcode::G_INTRINSIC_TRUNC:
1616     return legalizeIntrinsicTrunc(MI, MRI, B);
1617   case TargetOpcode::G_SITOFP:
1618     return legalizeITOFP(MI, MRI, B, true);
1619   case TargetOpcode::G_UITOFP:
1620     return legalizeITOFP(MI, MRI, B, false);
1621   case TargetOpcode::G_FPTOSI:
1622     return legalizeFPTOI(MI, MRI, B, true);
1623   case TargetOpcode::G_FPTOUI:
1624     return legalizeFPTOI(MI, MRI, B, false);
1625   case TargetOpcode::G_FMINNUM:
1626   case TargetOpcode::G_FMAXNUM:
1627   case TargetOpcode::G_FMINNUM_IEEE:
1628   case TargetOpcode::G_FMAXNUM_IEEE:
1629     return legalizeMinNumMaxNum(Helper, MI);
1630   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1631     return legalizeExtractVectorElt(MI, MRI, B);
1632   case TargetOpcode::G_INSERT_VECTOR_ELT:
1633     return legalizeInsertVectorElt(MI, MRI, B);
1634   case TargetOpcode::G_SHUFFLE_VECTOR:
1635     return legalizeShuffleVector(MI, MRI, B);
1636   case TargetOpcode::G_FSIN:
1637   case TargetOpcode::G_FCOS:
1638     return legalizeSinCos(MI, MRI, B);
1639   case TargetOpcode::G_GLOBAL_VALUE:
1640     return legalizeGlobalValue(MI, MRI, B);
1641   case TargetOpcode::G_LOAD:
1642     return legalizeLoad(MI, MRI, B, Observer);
1643   case TargetOpcode::G_FMAD:
1644     return legalizeFMad(MI, MRI, B);
1645   case TargetOpcode::G_FDIV:
1646     return legalizeFDIV(MI, MRI, B);
1647   case TargetOpcode::G_UDIV:
1648   case TargetOpcode::G_UREM:
1649     return legalizeUDIV_UREM(MI, MRI, B);
1650   case TargetOpcode::G_SDIV:
1651   case TargetOpcode::G_SREM:
1652     return legalizeSDIV_SREM(MI, MRI, B);
1653   case TargetOpcode::G_ATOMIC_CMPXCHG:
1654     return legalizeAtomicCmpXChg(MI, MRI, B);
1655   case TargetOpcode::G_FLOG:
1656     return legalizeFlog(MI, B, numbers::ln2f);
1657   case TargetOpcode::G_FLOG10:
1658     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1659   case TargetOpcode::G_FEXP:
1660     return legalizeFExp(MI, B);
1661   case TargetOpcode::G_FPOW:
1662     return legalizeFPow(MI, B);
1663   case TargetOpcode::G_FFLOOR:
1664     return legalizeFFloor(MI, MRI, B);
1665   case TargetOpcode::G_BUILD_VECTOR:
1666     return legalizeBuildVector(MI, MRI, B);
1667   default:
1668     return false;
1669   }
1670 
1671   llvm_unreachable("expected switch to return");
1672 }
1673 
1674 Register AMDGPULegalizerInfo::getSegmentAperture(
1675   unsigned AS,
1676   MachineRegisterInfo &MRI,
1677   MachineIRBuilder &B) const {
1678   MachineFunction &MF = B.getMF();
1679   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1680   const LLT S32 = LLT::scalar(32);
1681 
1682   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1683 
1684   if (ST.hasApertureRegs()) {
1685     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1686     // getreg.
1687     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1688         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1689         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1690     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1691         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1692         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1693     unsigned Encoding =
1694         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1695         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1696         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1697 
1698     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1699 
1700     B.buildInstr(AMDGPU::S_GETREG_B32)
1701       .addDef(GetReg)
1702       .addImm(Encoding);
1703     MRI.setType(GetReg, S32);
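    // The getreg result holds the aperture base in its low (WidthM1 + 1)
    // bits; shifting it back up by the field width reconstructs the high 32
    // bits of the 64-bit aperture address.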
1704 
1705     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1706     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1707   }
1708 
1709   Register QueuePtr = MRI.createGenericVirtualRegister(
1710     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1711 
1712   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1713     return Register();
1714 
1715   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1716   // private_segment_aperture_base_hi.
1717   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1718 
1719   // TODO: can we be smarter about machine pointer info?
1720   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1721   MachineMemOperand *MMO = MF.getMachineMemOperand(
1722       PtrInfo,
1723       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1724           MachineMemOperand::MOInvariant,
1725       4, commonAlignment(Align(64), StructOffset));
1726 
1727   Register LoadAddr;
1728 
1729   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1730   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1731 }
1732 
1733 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1734   MachineInstr &MI, MachineRegisterInfo &MRI,
1735   MachineIRBuilder &B) const {
1736   MachineFunction &MF = B.getMF();
1737 
1738   const LLT S32 = LLT::scalar(32);
1739   Register Dst = MI.getOperand(0).getReg();
1740   Register Src = MI.getOperand(1).getReg();
1741 
1742   LLT DstTy = MRI.getType(Dst);
1743   LLT SrcTy = MRI.getType(Src);
1744   unsigned DestAS = DstTy.getAddressSpace();
1745   unsigned SrcAS = SrcTy.getAddressSpace();
1746 
1747   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1748   // vector element.
1749   assert(!DstTy.isVector());
1750 
1751   const AMDGPUTargetMachine &TM
1752     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1753 
1754   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1755     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1756     return true;
1757   }
1758 
1759   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1760     // Truncate.
1761     B.buildExtract(Dst, Src, 0);
1762     MI.eraseFromParent();
1763     return true;
1764   }
1765 
1766   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1767     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1768     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1769 
1770     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1771     // another. Merge operands are required to be the same type, but creating an
1772     // extra ptrtoint would be kind of pointless.
1773     auto HighAddr = B.buildConstant(
1774       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1775     B.buildMerge(Dst, {Src, HighAddr});
1776     MI.eraseFromParent();
1777     return true;
1778   }
1779 
1780   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1781     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1782            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1783     unsigned NullVal = TM.getNullPointerValue(DestAS);
1784 
1785     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1786     auto FlatNull = B.buildConstant(SrcTy, 0);
1787 
    // Extract the low 32 bits of the pointer.
1789     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1790 
1791     auto CmpRes =
1792         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1793     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1794 
1795     MI.eraseFromParent();
1796     return true;
1797   }
1798 
1799   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1800     return false;
1801 
1802   if (!ST.hasFlatAddressSpace())
1803     return false;
1804 
1805   auto SegmentNull =
1806       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1807   auto FlatNull =
1808       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
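  // local/private -> flat: build the 64-bit flat pointer as a merge of the
  // 32-bit segment offset and the 32-bit aperture base, selecting the flat
  // null value when the source is the segment null pointer.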
1809 
1810   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1811   if (!ApertureReg.isValid())
1812     return false;
1813 
1814   auto CmpRes =
1815       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1816 
1817   // Coerce the type of the low half of the result so we can use merge_values.
1818   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1819 
1820   // TODO: Should we allow mismatched types but matching sizes in merges to
1821   // avoid the ptrtoint?
1822   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1823   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1824 
1825   MI.eraseFromParent();
1826   return true;
1827 }
1828 
1829 bool AMDGPULegalizerInfo::legalizeFrint(
1830   MachineInstr &MI, MachineRegisterInfo &MRI,
1831   MachineIRBuilder &B) const {
1832   Register Src = MI.getOperand(1).getReg();
1833   LLT Ty = MRI.getType(Src);
1834   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1835 
1836   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1837   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
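  // A sketch of the classic add/subtract trick used here: adding and then
  // subtracting copysign(2^52, x) discards any fraction bits (f64 has a
  // 52-bit fraction), leaving x rounded to an integer. C2 is 2^52 - 0.5, the
  // largest magnitude that still needs the trick; anything larger is already
  // an integer and is passed through by the final select.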
1838 
1839   auto C1 = B.buildFConstant(Ty, C1Val);
1840   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1841 
1842   // TODO: Should this propagate fast-math-flags?
1843   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1844   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1845 
1846   auto C2 = B.buildFConstant(Ty, C2Val);
1847   auto Fabs = B.buildFAbs(Ty, Src);
1848 
1849   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1850   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1851   MI.eraseFromParent();
1852   return true;
1853 }
1854 
1855 bool AMDGPULegalizerInfo::legalizeFceil(
1856   MachineInstr &MI, MachineRegisterInfo &MRI,
1857   MachineIRBuilder &B) const {
1858 
1859   const LLT S1 = LLT::scalar(1);
1860   const LLT S64 = LLT::scalar(64);
1861 
1862   Register Src = MI.getOperand(1).getReg();
1863   assert(MRI.getType(Src) == S64);
1864 
1865   // result = trunc(src)
1866   // if (src > 0.0 && src != result)
1867   //   result += 1.0
1868 
1869   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1870 
1871   const auto Zero = B.buildFConstant(S64, 0.0);
1872   const auto One = B.buildFConstant(S64, 1.0);
1873   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1874   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1875   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1876   auto Add = B.buildSelect(S64, And, One, Zero);
1877 
1878   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizeFrem(
1884   MachineInstr &MI, MachineRegisterInfo &MRI,
1885   MachineIRBuilder &B) const {
1886     Register DstReg = MI.getOperand(0).getReg();
1887     Register Src0Reg = MI.getOperand(1).getReg();
1888     Register Src1Reg = MI.getOperand(2).getReg();
1889     auto Flags = MI.getFlags();
1890     LLT Ty = MRI.getType(DstReg);
1891 
1892     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1893     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1894     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1895     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1896     MI.eraseFromParent();
1897     return true;
1898 }
1899 
1900 static MachineInstrBuilder extractF64Exponent(Register Hi,
1901                                               MachineIRBuilder &B) {
1902   const unsigned FractBits = 52;
1903   const unsigned ExpBits = 11;
1904   LLT S32 = LLT::scalar(32);
1905 
1906   auto Const0 = B.buildConstant(S32, FractBits - 32);
1907   auto Const1 = B.buildConstant(S32, ExpBits);
1908 
1909   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1910     .addUse(Hi)
1911     .addUse(Const0.getReg(0))
1912     .addUse(Const1.getReg(0));
1913 
1914   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1915 }
1916 
1917 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1918   MachineInstr &MI, MachineRegisterInfo &MRI,
1919   MachineIRBuilder &B) const {
1920   const LLT S1 = LLT::scalar(1);
1921   const LLT S32 = LLT::scalar(32);
1922   const LLT S64 = LLT::scalar(64);
1923 
1924   Register Src = MI.getOperand(1).getReg();
1925   assert(MRI.getType(Src) == S64);
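  // A sketch of the bit-pattern based trunc for f64:
  //   exp < 0   -> the result is +/-0 (keep only the sign bit)
  //   exp > 51  -> the value is already an integer; return it unchanged
  //   otherwise -> clear the fraction bits of the source below the exponent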
1926 
1927   // TODO: Should this use extract since the low half is unused?
1928   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1929   Register Hi = Unmerge.getReg(1);
1930 
1931   // Extract the upper half, since this is where we will find the sign and
1932   // exponent.
1933   auto Exp = extractF64Exponent(Hi, B);
1934 
1935   const unsigned FractBits = 52;
1936 
1937   // Extract the sign bit.
1938   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1939   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1940 
1941   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1942 
1943   const auto Zero32 = B.buildConstant(S32, 0);
1944 
1945   // Extend back to 64-bits.
1946   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1947 
1948   auto Shr = B.buildAShr(S64, FractMask, Exp);
1949   auto Not = B.buildNot(S64, Shr);
1950   auto Tmp0 = B.buildAnd(S64, Src, Not);
1951   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1952 
1953   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1954   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1955 
1956   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1957   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1958   MI.eraseFromParent();
1959   return true;
1960 }
1961 
1962 bool AMDGPULegalizerInfo::legalizeITOFP(
1963   MachineInstr &MI, MachineRegisterInfo &MRI,
1964   MachineIRBuilder &B, bool Signed) const {
1965 
1966   Register Dst = MI.getOperand(0).getReg();
1967   Register Src = MI.getOperand(1).getReg();
1968 
1969   const LLT S64 = LLT::scalar(64);
1970   const LLT S32 = LLT::scalar(32);
1971 
1972   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
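  // Sketch of the expansion: treat the 64-bit source as {lo, hi} 32-bit
  // halves and compute
  //   result = ldexp(fp(hi), 32) + uitofp(lo)
  // where fp(hi) is a signed or unsigned conversion depending on the opcode.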
1973 
1974   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1975 
1976   auto CvtHi = Signed ?
1977     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1978     B.buildUITOFP(S64, Unmerge.getReg(1));
1979 
1980   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1981 
1982   auto ThirtyTwo = B.buildConstant(S32, 32);
1983   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1984     .addUse(CvtHi.getReg(0))
1985     .addUse(ThirtyTwo.getReg(0));
1986 
1987   // TODO: Should this propagate fast-math-flags?
1988   B.buildFAdd(Dst, LdExp, CvtLo);
1989   MI.eraseFromParent();
1990   return true;
1991 }
1992 
1993 // TODO: Copied from DAG implementation. Verify logic and document how this
1994 // actually works.
1995 bool AMDGPULegalizerInfo::legalizeFPTOI(
1996   MachineInstr &MI, MachineRegisterInfo &MRI,
1997   MachineIRBuilder &B, bool Signed) const {
1998 
1999   Register Dst = MI.getOperand(0).getReg();
2000   Register Src = MI.getOperand(1).getReg();
2001 
2002   const LLT S64 = LLT::scalar(64);
2003   const LLT S32 = LLT::scalar(32);
2004 
2005   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2006 
2007   unsigned Flags = MI.getFlags();
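  // A sketch of the math (K0 and K1 below are the f64 bit patterns for 2^-32
  // and -(2^32)):
  //   hi = floor(trunc(x) * 2^-32)
  //   lo = fma(hi, -(2^32), trunc(x))    == trunc(x) - hi * 2^32
  // and the result is the 64-bit merge of fptoui(lo) and fpto[su]i(hi).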
2008 
2009   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2010   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2011   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2012 
2013   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2014   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2015   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2016 
2017   auto Hi = Signed ?
2018     B.buildFPTOSI(S32, FloorMul) :
2019     B.buildFPTOUI(S32, FloorMul);
2020   auto Lo = B.buildFPTOUI(S32, Fma);
2021 
2022   B.buildMerge(Dst, { Lo, Hi });
2023   MI.eraseFromParent();
2024 
2025   return true;
2026 }
2027 
2028 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2029                                                MachineInstr &MI) const {
2030   MachineFunction &MF = Helper.MIRBuilder.getMF();
2031   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2032 
2033   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2034                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2035 
2036   // With ieee_mode disabled, the instructions have the correct behavior
2037   // already for G_FMINNUM/G_FMAXNUM
2038   if (!MFI->getMode().IEEE)
2039     return !IsIEEEOp;
2040 
2041   if (IsIEEEOp)
2042     return true;
2043 
2044   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2045 }
2046 
2047 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2048   MachineInstr &MI, MachineRegisterInfo &MRI,
2049   MachineIRBuilder &B) const {
2050   // TODO: Should move some of this into LegalizerHelper.
2051 
2052   // TODO: Promote dynamic indexing of s16 to s32
2053 
2054   // FIXME: Artifact combiner probably should have replaced the truncated
2055   // constant before this, so we shouldn't need
2056   // getConstantVRegValWithLookThrough.
2057   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2058     MI.getOperand(2).getReg(), MRI);
2059   if (!IdxVal) // Dynamic case will be selected to register indexing.
2060     return true;
2061 
2062   Register Dst = MI.getOperand(0).getReg();
2063   Register Vec = MI.getOperand(1).getReg();
2064 
2065   LLT VecTy = MRI.getType(Vec);
2066   LLT EltTy = VecTy.getElementType();
2067   assert(EltTy == MRI.getType(Dst));
2068 
2069   if (IdxVal->Value < VecTy.getNumElements())
2070     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2071   else
2072     B.buildUndef(Dst);
2073 
2074   MI.eraseFromParent();
2075   return true;
2076 }
2077 
2078 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2079   MachineInstr &MI, MachineRegisterInfo &MRI,
2080   MachineIRBuilder &B) const {
2081   // TODO: Should move some of this into LegalizerHelper.
2082 
2083   // TODO: Promote dynamic indexing of s16 to s32
2084 
2085   // FIXME: Artifact combiner probably should have replaced the truncated
2086   // constant before this, so we shouldn't need
2087   // getConstantVRegValWithLookThrough.
2088   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2089     MI.getOperand(3).getReg(), MRI);
2090   if (!IdxVal) // Dynamic case will be selected to register indexing.
2091     return true;
2092 
2093   Register Dst = MI.getOperand(0).getReg();
2094   Register Vec = MI.getOperand(1).getReg();
2095   Register Ins = MI.getOperand(2).getReg();
2096 
2097   LLT VecTy = MRI.getType(Vec);
2098   LLT EltTy = VecTy.getElementType();
2099   assert(EltTy == MRI.getType(Ins));
2100 
2101   if (IdxVal->Value < VecTy.getNumElements())
2102     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2103   else
2104     B.buildUndef(Dst);
2105 
2106   MI.eraseFromParent();
2107   return true;
2108 }
2109 
2110 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2111   MachineInstr &MI, MachineRegisterInfo &MRI,
2112   MachineIRBuilder &B) const {
2113   const LLT V2S16 = LLT::vector(2, 16);
2114 
2115   Register Dst = MI.getOperand(0).getReg();
2116   Register Src0 = MI.getOperand(1).getReg();
2117   LLT DstTy = MRI.getType(Dst);
2118   LLT SrcTy = MRI.getType(Src0);
2119 
2120   if (SrcTy == V2S16 && DstTy == V2S16 &&
2121       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2122     return true;
2123 
2124   MachineIRBuilder HelperBuilder(MI);
2125   GISelObserverWrapper DummyObserver;
2126   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2127   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2128 }
2129 
2130 bool AMDGPULegalizerInfo::legalizeSinCos(
2131   MachineInstr &MI, MachineRegisterInfo &MRI,
2132   MachineIRBuilder &B) const {
2133 
2134   Register DstReg = MI.getOperand(0).getReg();
2135   Register SrcReg = MI.getOperand(1).getReg();
2136   LLT Ty = MRI.getType(DstReg);
2137   unsigned Flags = MI.getFlags();
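  // The hardware sin/cos intrinsics take an input scaled by 1/(2*pi), so this
  // roughly builds amdgcn_{sin,cos}(fract(x * 1/(2*pi))), dropping the fract
  // on subtargets that handle the range reduction themselves.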
2138 
2139   Register TrigVal;
2140   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2141   if (ST.hasTrigReducedRange()) {
2142     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2143     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2144       .addUse(MulVal.getReg(0))
2145       .setMIFlags(Flags).getReg(0);
2146   } else
2147     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2148 
2149   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2150     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2151   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2152     .addUse(TrigVal)
2153     .setMIFlags(Flags);
2154   MI.eraseFromParent();
2155   return true;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2159                                                   MachineIRBuilder &B,
2160                                                   const GlobalValue *GV,
2161                                                   int64_t Offset,
2162                                                   unsigned GAFlags) const {
2163   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2164   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2165   // to the following code sequence:
2166   //
2167   // For constant address space:
2168   //   s_getpc_b64 s[0:1]
2169   //   s_add_u32 s0, s0, $symbol
2170   //   s_addc_u32 s1, s1, 0
2171   //
2172   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2173   //   a fixup or relocation is emitted to replace $symbol with a literal
2174   //   constant, which is a pc-relative offset from the encoding of the $symbol
2175   //   operand to the global variable.
2176   //
2177   // For global address space:
2178   //   s_getpc_b64 s[0:1]
2179   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2180   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2181   //
2182   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2183   //   fixups or relocations are emitted to replace $symbol@*@lo and
2184   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2185   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2186   //   operand to the global variable.
2187   //
2188   // What we want here is an offset from the value returned by s_getpc
2189   // (which is the address of the s_add_u32 instruction) to the global
2190   // variable, but since the encoding of $symbol starts 4 bytes after the start
2191   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2192   // small. This requires us to add 4 to the global variable offset in order to
2193   // compute the correct address.
2194 
2195   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2196 
2197   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2198     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2199 
2200   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2201     .addDef(PCReg);
2202 
2203   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2204   if (GAFlags == SIInstrInfo::MO_NONE)
2205     MIB.addImm(0);
2206   else
2207     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2208 
2209   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2210 
2211   if (PtrTy.getSizeInBits() == 32)
2212     B.buildExtract(DstReg, PCReg, 0);
2213   return true;
2214  }
2215 
2216 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2217   MachineInstr &MI, MachineRegisterInfo &MRI,
2218   MachineIRBuilder &B) const {
2219   Register DstReg = MI.getOperand(0).getReg();
2220   LLT Ty = MRI.getType(DstReg);
2221   unsigned AS = Ty.getAddressSpace();
2222 
2223   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2224   MachineFunction &MF = B.getMF();
2225   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2226 
2227   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2228     if (!MFI->isEntryFunction()) {
2229       const Function &Fn = MF.getFunction();
2230       DiagnosticInfoUnsupported BadLDSDecl(
2231         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2232         DS_Warning);
2233       Fn.getContext().diagnose(BadLDSDecl);
2234 
2235       // We currently don't have a way to correctly allocate LDS objects that
2236       // aren't directly associated with a kernel. We do force inlining of
2237       // functions that use local objects. However, if these dead functions are
2238       // not eliminated, we don't want a compile time error. Just emit a warning
2239       // and a trap, since there should be no callable path here.
2240       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2241       B.buildUndef(DstReg);
2242       MI.eraseFromParent();
2243       return true;
2244     }
2245 
2246     // TODO: We could emit code to handle the initialization somewhere.
2247     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2248       const SITargetLowering *TLI = ST.getTargetLowering();
2249       if (!TLI->shouldUseLDSConstAddress(GV)) {
2250         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2251         return true; // Leave in place;
2252       }
2253 
2254       B.buildConstant(
2255           DstReg,
2256           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2257       MI.eraseFromParent();
2258       return true;
2259     }
2260 
2261     const Function &Fn = MF.getFunction();
2262     DiagnosticInfoUnsupported BadInit(
2263       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2264     Fn.getContext().diagnose(BadInit);
2265     return true;
2266   }
2267 
2268   const SITargetLowering *TLI = ST.getTargetLowering();
2269 
2270   if (TLI->shouldEmitFixup(GV)) {
2271     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2272     MI.eraseFromParent();
2273     return true;
2274   }
2275 
2276   if (TLI->shouldEmitPCReloc(GV)) {
2277     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2278     MI.eraseFromParent();
2279     return true;
2280   }
2281 
2282   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2283   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2284 
2285   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2286       MachinePointerInfo::getGOT(MF),
2287       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2288           MachineMemOperand::MOInvariant,
2289       8 /*Size*/, Align(8));
2290 
2291   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2292 
2293   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2295     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2296     B.buildExtract(DstReg, Load, 0);
2297   } else
2298     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2299 
2300   MI.eraseFromParent();
2301   return true;
2302 }
2303 
2304 bool AMDGPULegalizerInfo::legalizeLoad(
2305   MachineInstr &MI, MachineRegisterInfo &MRI,
2306   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2307   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2308   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2309   Observer.changingInstr(MI);
2310   MI.getOperand(1).setReg(Cast.getReg(0));
2311   Observer.changedInstr(MI);
2312   return true;
2313 }
2314 
2315 bool AMDGPULegalizerInfo::legalizeFMad(
2316   MachineInstr &MI, MachineRegisterInfo &MRI,
2317   MachineIRBuilder &B) const {
2318   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2319   assert(Ty.isScalar());
2320 
2321   MachineFunction &MF = B.getMF();
2322   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2323 
2324   // TODO: Always legal with future ftz flag.
2325   // FIXME: Do we need just output?
2326   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2327     return true;
2328   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2329     return true;
2330 
2331   MachineIRBuilder HelperBuilder(MI);
2332   GISelObserverWrapper DummyObserver;
2333   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2334   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2335 }
2336 
2337 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2338   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2339   Register DstReg = MI.getOperand(0).getReg();
2340   Register PtrReg = MI.getOperand(1).getReg();
2341   Register CmpVal = MI.getOperand(2).getReg();
2342   Register NewVal = MI.getOperand(3).getReg();
2343 
2344   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2345          "this should not have been custom lowered");
2346 
2347   LLT ValTy = MRI.getType(CmpVal);
2348   LLT VecTy = LLT::vector(2, ValTy);
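  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
  // into a single <2 x T> operand: element 0 is the value to store and
  // element 1 is the expected value.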
2349 
2350   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2351 
2352   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2353     .addDef(DstReg)
2354     .addUse(PtrReg)
2355     .addUse(PackedVal)
2356     .setMemRefs(MI.memoperands());
2357 
2358   MI.eraseFromParent();
2359   return true;
2360 }
2361 
2362 bool AMDGPULegalizerInfo::legalizeFlog(
2363   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2364   Register Dst = MI.getOperand(0).getReg();
2365   Register Src = MI.getOperand(1).getReg();
2366   LLT Ty = B.getMRI()->getType(Dst);
2367   unsigned Flags = MI.getFlags();
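  // log_b(x) is expanded as log2(x) * Log2BaseInverted, where the caller
  // passes Log2BaseInverted = 1/log2(b) (ln(2) for G_FLOG, ln(2)/ln(10) for
  // G_FLOG10; see legalizeCustom above).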
2368 
2369   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2370   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2371 
2372   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2373   MI.eraseFromParent();
2374   return true;
2375 }
2376 
2377 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2378                                        MachineIRBuilder &B) const {
2379   Register Dst = MI.getOperand(0).getReg();
2380   Register Src = MI.getOperand(1).getReg();
2381   unsigned Flags = MI.getFlags();
2382   LLT Ty = B.getMRI()->getType(Dst);
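  // exp(x) is expanded as exp2(x * log2(e)).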
2383 
2384   auto K = B.buildFConstant(Ty, numbers::log2e);
2385   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2386   B.buildFExp2(Dst, Mul, Flags);
2387   MI.eraseFromParent();
2388   return true;
2389 }
2390 
2391 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2392                                        MachineIRBuilder &B) const {
2393   Register Dst = MI.getOperand(0).getReg();
2394   Register Src0 = MI.getOperand(1).getReg();
2395   Register Src1 = MI.getOperand(2).getReg();
2396   unsigned Flags = MI.getFlags();
2397   LLT Ty = B.getMRI()->getType(Dst);
2398   const LLT S16 = LLT::scalar(16);
2399   const LLT S32 = LLT::scalar(32);
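  // pow(x, y) is expanded as exp2(log2(x) * y). The legacy multiply (where a
  // zero operand yields zero rather than NaN) keeps cases like pow(0, 0)
  // producing 1 instead of NaN.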
2400 
2401   if (Ty == S32) {
2402     auto Log = B.buildFLog2(S32, Src0, Flags);
2403     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2404       .addUse(Log.getReg(0))
2405       .addUse(Src1)
2406       .setMIFlags(Flags);
2407     B.buildFExp2(Dst, Mul, Flags);
2408   } else if (Ty == S16) {
2409     // There's no f16 fmul_legacy, so we need to convert for it.
2410     auto Log = B.buildFLog2(S16, Src0, Flags);
2411     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2412     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2413     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2414       .addUse(Ext0.getReg(0))
2415       .addUse(Ext1.getReg(0))
2416       .setMIFlags(Flags);
2417 
2418     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2419   } else
2420     return false;
2421 
2422   MI.eraseFromParent();
2423   return true;
2424 }
2425 
2426 // Find a source register, ignoring any possible source modifiers.
2427 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2428   Register ModSrc = OrigSrc;
2429   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2430     ModSrc = SrcFNeg->getOperand(1).getReg();
2431     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2432       ModSrc = SrcFAbs->getOperand(1).getReg();
2433   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2434     ModSrc = SrcFAbs->getOperand(1).getReg();
2435   return ModSrc;
2436 }
2437 
2438 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2439                                          MachineRegisterInfo &MRI,
2440                                          MachineIRBuilder &B) const {
2441 
2442   const LLT S1 = LLT::scalar(1);
2443   const LLT S64 = LLT::scalar(64);
2444   Register Dst = MI.getOperand(0).getReg();
2445   Register OrigSrc = MI.getOperand(1).getReg();
2446   unsigned Flags = MI.getFlags();
2447   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2448          "this should not have been custom lowered");
2449 
2450   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2451   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2452   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2453   // V_FRACT bug is:
2454   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2455   //
2456   // Convert floor(x) to (x - fract(x))
2457 
2458   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2459     .addUse(OrigSrc)
2460     .setMIFlags(Flags);
2461 
2462   // Give source modifier matching some assistance before obscuring a foldable
2463   // pattern.
2464 
2465   // TODO: We can avoid the neg on the fract? The input sign to fract
2466   // shouldn't matter?
2467   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2468 
2469   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2470 
2471   Register Min = MRI.createGenericVirtualRegister(S64);
2472 
2473   // We don't need to concern ourselves with the snan handling difference, so
2474   // use the one which will directly select.
2475   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2476   if (MFI->getMode().IEEE)
2477     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2478   else
2479     B.buildFMinNum(Min, Fract, Const, Flags);
2480 
2481   Register CorrectedFract = Min;
2482   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2483     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2484     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2485   }
2486 
2487   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2488   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2489 
2490   MI.eraseFromParent();
2491   return true;
2492 }
2493 
2494 // Turn an illegal packed v2s16 build vector into bit operations.
2495 // TODO: This should probably be a bitcast action in LegalizerHelper.
2496 bool AMDGPULegalizerInfo::legalizeBuildVector(
2497   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2498   Register Dst = MI.getOperand(0).getReg();
2499   const LLT S32 = LLT::scalar(32);
2500   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2501 
2502   Register Src0 = MI.getOperand(1).getReg();
2503   Register Src1 = MI.getOperand(2).getReg();
2504   assert(MRI.getType(Src0) == LLT::scalar(16));
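  // The packed result is produced with plain bit operations: merge the two
  // s16 halves into an s32, then bitcast that back to <2 x s16>.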
2505 
2506   auto Merge = B.buildMerge(S32, {Src0, Src1});
2507   B.buildBitcast(Dst, Merge);
2508 
2509   MI.eraseFromParent();
2510   return true;
2511 }
2512 
// Return the use branch instruction, or null if the usage is invalid.
2514 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2515                                        MachineRegisterInfo &MRI,
2516                                        MachineInstr *&Br,
2517                                        MachineBasicBlock *&UncondBrTarget) {
2518   Register CondDef = MI.getOperand(0).getReg();
2519   if (!MRI.hasOneNonDBGUse(CondDef))
2520     return nullptr;
2521 
2522   MachineBasicBlock *Parent = MI.getParent();
2523   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2524   if (UseMI.getParent() != Parent ||
2525       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2526     return nullptr;
2527 
2528   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2529   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2530   if (Next == Parent->end()) {
2531     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2532     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2533       return nullptr;
2534     UncondBrTarget = &*NextMBB;
2535   } else {
2536     if (Next->getOpcode() != AMDGPU::G_BR)
2537       return nullptr;
2538     Br = &*Next;
2539     UncondBrTarget = Br->getOperand(0).getMBB();
2540   }
2541 
2542   return &UseMI;
2543 }
2544 
2545 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2546                                          const ArgDescriptor *Arg,
2547                                          const TargetRegisterClass *ArgRC,
2548                                          LLT ArgTy) const {
2549   MCRegister SrcReg = Arg->getRegister();
2550   assert(SrcReg.isPhysical() && "Physical register expected");
2551   assert(DstReg.isVirtual() && "Virtual register expected");
2552 
2553   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2554                                              ArgTy);
2555   if (Arg->isMasked()) {
2556     // TODO: Should we try to emit this once in the entry block?
2557     const LLT S32 = LLT::scalar(32);
2558     const unsigned Mask = Arg->getMask();
2559     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
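    // For instance, a value packed in bits [29:20] has Mask = 0x3ff00000, so
    // it is recovered as (LiveIn >> 20) & 0x3ff.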
2560 
2561     Register AndMaskSrc = LiveIn;
2562 
2563     if (Shift != 0) {
2564       auto ShiftAmt = B.buildConstant(S32, Shift);
2565       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2566     }
2567 
2568     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2569   } else {
2570     B.buildCopy(DstReg, LiveIn);
2571   }
2572 
2573   return true;
2574 }
2575 
2576 bool AMDGPULegalizerInfo::loadInputValue(
2577     Register DstReg, MachineIRBuilder &B,
2578     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2579   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2580   const ArgDescriptor *Arg;
2581   const TargetRegisterClass *ArgRC;
2582   LLT ArgTy;
2583   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2584 
2585   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2586     return false; // TODO: Handle these
2587   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2588 }
2589 
2590 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2591     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2592     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2593   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2594     return false;
2595 
2596   MI.eraseFromParent();
2597   return true;
2598 }
2599 
2600 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2601                                        MachineRegisterInfo &MRI,
2602                                        MachineIRBuilder &B) const {
2603   Register Dst = MI.getOperand(0).getReg();
2604   LLT DstTy = MRI.getType(Dst);
2605   LLT S16 = LLT::scalar(16);
2606   LLT S32 = LLT::scalar(32);
2607   LLT S64 = LLT::scalar(64);
2608 
2609   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2610     return true;
2611 
2612   if (DstTy == S16)
2613     return legalizeFDIV16(MI, MRI, B);
2614   if (DstTy == S32)
2615     return legalizeFDIV32(MI, MRI, B);
2616   if (DstTy == S64)
2617     return legalizeFDIV64(MI, MRI, B);
2618 
2619   return false;
2620 }
2621 
2622 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2623                                                   Register DstReg,
2624                                                   Register X,
2625                                                   Register Y,
2626                                                   bool IsDiv) const {
2627   const LLT S1 = LLT::scalar(1);
2628   const LLT S32 = LLT::scalar(32);
2629 
2630   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2631   // algorithm used here.
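  //
  // Roughly: z is an estimate of 2^32 / y obtained from v_rcp_iflag_f32 and
  // refined with one Newton-Raphson step; q = umulh(x, z) and r = x - q * y
  // are the quotient/remainder estimates, and two conditional corrections
  // (q += 1, r -= y while r >= y) produce the final values.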
2632 
2633   // Initial estimate of inv(y).
2634   auto FloatY = B.buildUITOFP(S32, Y);
2635   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2636   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2637   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2638   auto Z = B.buildFPTOUI(S32, ScaledY);
2639 
2640   // One round of UNR.
2641   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2642   auto NegYZ = B.buildMul(S32, NegY, Z);
2643   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2644 
2645   // Quotient/remainder estimate.
2646   auto Q = B.buildUMulH(S32, X, Z);
2647   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2648 
2649   // First quotient/remainder refinement.
2650   auto One = B.buildConstant(S32, 1);
2651   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2652   if (IsDiv)
2653     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2654   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2655 
2656   // Second quotient/remainder refinement.
2657   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2658   if (IsDiv)
2659     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2660   else
2661     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2662 }
2663 
2664 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2665                                               MachineRegisterInfo &MRI,
2666                                               MachineIRBuilder &B) const {
2667   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2668   Register DstReg = MI.getOperand(0).getReg();
2669   Register Num = MI.getOperand(1).getReg();
2670   Register Den = MI.getOperand(2).getReg();
2671   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2672   MI.eraseFromParent();
2673   return true;
2674 }
2675 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32.
//
// Returns {lo, hi} of the result.
2679 //
2680 // %cvt.lo = G_UITOFP Val.lo
2681 // %cvt.hi = G_UITOFP Val.hi
2682 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2683 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2684 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2685 // %mul2 = G_FMUL %mul1, 2**(-32)
2686 // %trunc = G_INTRINSIC_TRUNC %mul2
2687 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2688 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2689 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2690                                                        Register Val) {
2691   const LLT S32 = LLT::scalar(32);
2692   auto Unmerge = B.buildUnmerge(S32, Val);
2693 
2694   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2695   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2696 
2697   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2698                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2699 
2700   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2701   auto Mul1 =
2702       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2703 
2704   // 2**(-32)
2705   auto Mul2 =
2706       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2707   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2708 
2709   // -(2**32)
2710   auto Mad2 = B.buildFMAD(S32, Trunc,
2711                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2712 
2713   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2714   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2715 
2716   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2717 }
2718 
2719 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2720                                                   Register DstReg,
2721                                                   Register Numer,
2722                                                   Register Denom,
2723                                                   bool IsDiv) const {
2724   const LLT S32 = LLT::scalar(32);
2725   const LLT S64 = LLT::scalar(64);
2726   const LLT S1 = LLT::scalar(1);
2727   Register RcpLo, RcpHi;
2728 
2729   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2730 
2731   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2732 
2733   auto Zero64 = B.buildConstant(S64, 0);
2734   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2735 
2736   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2737   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2738 
2739   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2740   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2741   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2742 
2743   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2744   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2745   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2746   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2747 
2748   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2749   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2750   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2751   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2752   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2753 
2754   auto Zero32 = B.buildConstant(S32, 0);
2755   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2756   auto Add2_HiC =
2757       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2758   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2759   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2760 
2761   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2762   Register NumerLo = UnmergeNumer.getReg(0);
2763   Register NumerHi = UnmergeNumer.getReg(1);
2764 
2765   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2766   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2767   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2768   Register Mul3_Lo = UnmergeMul3.getReg(0);
2769   Register Mul3_Hi = UnmergeMul3.getReg(1);
2770   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2771   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2772   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2773   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2774 
2775   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2776   Register DenomLo = UnmergeDenom.getReg(0);
2777   Register DenomHi = UnmergeDenom.getReg(1);
2778 
2779   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2780   auto C1 = B.buildSExt(S32, CmpHi);
2781 
2782   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2783   auto C2 = B.buildSExt(S32, CmpLo);
2784 
2785   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2786   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2787 
  // TODO: Here and below, portions of the code could be enclosed in if/endif.
  // Currently the control flow is unconditional and we have 4 selects after
  // the potential endif to substitute PHIs.
2791 
2792   // if C3 != 0 ...
2793   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2794   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2795   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2796   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2797 
2798   auto One64 = B.buildConstant(S64, 1);
2799   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2800 
2801   auto C4 =
2802       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2803   auto C5 =
2804       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2805   auto C6 = B.buildSelect(
2806       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2807 
2808   // if (C6 != 0)
2809   auto Add4 = B.buildAdd(S64, Add3, One64);
2810   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2811 
2812   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2813   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2814   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2815 
2816   // endif C6
2817   // endif C3
2818 
2819   if (IsDiv) {
2820     auto Sel1 = B.buildSelect(
2821         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2822     B.buildSelect(DstReg,
2823                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2824   } else {
2825     auto Sel2 = B.buildSelect(
2826         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2827     B.buildSelect(DstReg,
2828                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2829   }
2830 }
2831 
2832 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2833                                             MachineRegisterInfo &MRI,
2834                                             MachineIRBuilder &B) const {
2835   const LLT S64 = LLT::scalar(64);
2836   const LLT S32 = LLT::scalar(32);
2837   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2838   Register DstReg = MI.getOperand(0).getReg();
2839   Register Num = MI.getOperand(1).getReg();
2840   Register Den = MI.getOperand(2).getReg();
2841   LLT Ty = MRI.getType(DstReg);
2842 
2843   if (Ty == S32)
2844     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2845   else if (Ty == S64)
2846     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2847   else
2848     return false;
2849 
2850   MI.eraseFromParent();
  return true;
}
2854 
2855 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2856                                             MachineRegisterInfo &MRI,
2857                                             MachineIRBuilder &B) const {
2858   const LLT S64 = LLT::scalar(64);
2859   const LLT S32 = LLT::scalar(32);
2860 
2861   Register DstReg = MI.getOperand(0).getReg();
2862   const LLT Ty = MRI.getType(DstReg);
2863   if (Ty != S32 && Ty != S64)
2864     return false;
2865 
2866   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2867 
2868   Register LHS = MI.getOperand(1).getReg();
2869   Register RHS = MI.getOperand(2).getReg();
2870 
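  // Sketch of the approach below: take absolute values, perform an unsigned
  // divide/remainder, then fix up the sign. With S = X >> (bits - 1) being the
  // all-ones or all-zeros sign mask of X, (X + S) ^ S == |X|, and (R ^ S) - S
  // negates R exactly when S is all ones.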
2871   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2872   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2873   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2874 
2875   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2876   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2877 
2878   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2879   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2880 
2881   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2882   if (Ty == S32)
2883     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2884   else
2885     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2886 
2887   Register Sign;
2888   if (IsDiv)
2889     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2890   else
2891     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2892 
2893   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2894   B.buildSub(DstReg, UDivRem, Sign);
2895 
2896   MI.eraseFromParent();
2897   return true;
2898 }
2899 
2900 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2901                                                  MachineRegisterInfo &MRI,
2902                                                  MachineIRBuilder &B) const {
2903   Register Res = MI.getOperand(0).getReg();
2904   Register LHS = MI.getOperand(1).getReg();
2905   Register RHS = MI.getOperand(2).getReg();
2906 
2907   uint16_t Flags = MI.getFlags();
2908 
2909   LLT ResTy = MRI.getType(Res);
2910   LLT S32 = LLT::scalar(32);
2911   LLT S64 = LLT::scalar(64);
2912 
2913   const MachineFunction &MF = B.getMF();
2914   bool Unsafe =
2915     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2916 
2917   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2918     return false;
2919 
2920   if (!Unsafe && ResTy == S32 &&
2921       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2922     return false;
2923 
2924   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2925     // 1 / x -> RCP(x)
2926     if (CLHS->isExactlyValue(1.0)) {
2927       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2928         .addUse(RHS)
2929         .setMIFlags(Flags);
2930 
2931       MI.eraseFromParent();
2932       return true;
2933     }
2934 
2935     // -1 / x -> RCP( FNEG(x) )
2936     if (CLHS->isExactlyValue(-1.0)) {
2937       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2938       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2939         .addUse(FNeg.getReg(0))
2940         .setMIFlags(Flags);
2941 
2942       MI.eraseFromParent();
2943       return true;
2944     }
2945   }
2946 
2947   // x / y -> x * (1.0 / y)
2948   if (Unsafe) {
2949     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2950       .addUse(RHS)
2951       .setMIFlags(Flags);
2952     B.buildFMul(Res, LHS, RCP, Flags);
2953 
2954     MI.eraseFromParent();
2955     return true;
2956   }
2957 
2958   return false;
2959 }
2960 
2961 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2962                                          MachineRegisterInfo &MRI,
2963                                          MachineIRBuilder &B) const {
2964   Register Res = MI.getOperand(0).getReg();
2965   Register LHS = MI.getOperand(1).getReg();
2966   Register RHS = MI.getOperand(2).getReg();
2967 
2968   uint16_t Flags = MI.getFlags();
2969 
2970   LLT S16 = LLT::scalar(16);
2971   LLT S32 = LLT::scalar(32);
2972 
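  // A brief note on the approach (descriptive only): the f16 division is done in
  // f32. Extend both operands, approximate 1.0 / RHS with the rcp intrinsic,
  // multiply, truncate back to f16, and let amdgcn_div_fixup handle the special
  // cases of division.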
2973   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2974   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2975 
2976   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2977     .addUse(RHSExt.getReg(0))
2978     .setMIFlags(Flags);
2979 
2980   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2981   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2982 
2983   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2984     .addUse(RDst.getReg(0))
2985     .addUse(RHS)
2986     .addUse(LHS)
2987     .setMIFlags(Flags);
2988 
2989   MI.eraseFromParent();
2990   return true;
2991 }
2992 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// that enable FP32 denormals; when it is false, restore the function's default
// FP32 denorm mode.
2995 static void toggleSPDenormMode(bool Enable,
2996                                MachineIRBuilder &B,
2997                                const GCNSubtarget &ST,
2998                                AMDGPU::SIModeRegisterDefaults Mode) {
2999   // Set SP denorm mode to this value.
3000   unsigned SPDenormMode =
3001     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3002 
3003   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3005     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3006 
3007     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3008     B.buildInstr(AMDGPU::S_DENORM_MODE)
3009       .addImm(NewDenormModeValue);
3010 
3011   } else {
3012     // Select FP32 bit field in mode register.
3013     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3014                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3015                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
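    // This should select the 2-bit field at offset 4 of the MODE register
    // (WIDTH_M1 encodes width - 1), i.e. the FP_DENORM bits that control
    // single-precision denormal handling.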
3016 
3017     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3018       .addImm(SPDenormMode)
3019       .addImm(SPDenormModeBitField);
3020   }
3021 }
3022 
3023 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3024                                          MachineRegisterInfo &MRI,
3025                                          MachineIRBuilder &B) const {
3026   Register Res = MI.getOperand(0).getReg();
3027   Register LHS = MI.getOperand(1).getReg();
3028   Register RHS = MI.getOperand(2).getReg();
3029   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3030   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3031 
3032   uint16_t Flags = MI.getFlags();
3033 
3034   LLT S32 = LLT::scalar(32);
3035   LLT S1 = LLT::scalar(1);
3036 
3037   auto One = B.buildFConstant(S32, 1.0f);
3038 
3039   auto DenominatorScaled =
3040     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3041       .addUse(LHS)
3042       .addUse(RHS)
3043       .addImm(0)
3044       .setMIFlags(Flags);
3045   auto NumeratorScaled =
3046     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3047       .addUse(LHS)
3048       .addUse(RHS)
3049       .addImm(1)
3050       .setMIFlags(Flags);
3051 
3052   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3053     .addUse(DenominatorScaled.getReg(0))
3054     .setMIFlags(Flags);
3055   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3056 
3057   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3058   // aren't modeled as reading it.
3059   if (!Mode.allFP32Denormals())
3060     toggleSPDenormMode(true, B, ST, Mode);
3061 
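  // Roughly, as a descriptive sketch of the sequence below: ApproxRcp ~ 1/d for
  // the scaled denominator d. Fma0 = 1 - d * r is the reciprocal error, Fma1
  // refines the reciprocal by one Newton-Raphson step, Mul = n * r' is the
  // initial quotient estimate, Fma2/Fma3 refine the quotient, and Fma4 is the
  // final remainder that div_fmas uses for the last rounding step.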
3062   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3063   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3064   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3065   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3066   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3067   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3068 
3069   if (!Mode.allFP32Denormals())
3070     toggleSPDenormMode(false, B, ST, Mode);
3071 
3072   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3073     .addUse(Fma4.getReg(0))
3074     .addUse(Fma1.getReg(0))
3075     .addUse(Fma3.getReg(0))
3076     .addUse(NumeratorScaled.getReg(1))
3077     .setMIFlags(Flags);
3078 
3079   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3080     .addUse(Fmas.getReg(0))
3081     .addUse(RHS)
3082     .addUse(LHS)
3083     .setMIFlags(Flags);
3084 
3085   MI.eraseFromParent();
3086   return true;
3087 }
3088 
3089 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3090                                          MachineRegisterInfo &MRI,
3091                                          MachineIRBuilder &B) const {
3092   Register Res = MI.getOperand(0).getReg();
3093   Register LHS = MI.getOperand(1).getReg();
3094   Register RHS = MI.getOperand(2).getReg();
3095 
3096   uint16_t Flags = MI.getFlags();
3097 
3098   LLT S64 = LLT::scalar(64);
3099   LLT S1 = LLT::scalar(1);
3100 
3101   auto One = B.buildFConstant(S64, 1.0);
3102 
3103   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3104     .addUse(LHS)
3105     .addUse(RHS)
3106     .addImm(0)
3107     .setMIFlags(Flags);
3108 
3109   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3110 
3111   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3112     .addUse(DivScale0.getReg(0))
3113     .setMIFlags(Flags);
3114 
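  // The f64 sequence below follows the same reciprocal-refinement idea as the
  // f32 path above, but refines the reciprocal twice and feeds the second
  // div_scale result into div_fmas for the final rounding.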
3115   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3116   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3117   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3118 
3119   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3120     .addUse(LHS)
3121     .addUse(RHS)
3122     .addImm(1)
3123     .setMIFlags(Flags);
3124 
3125   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3126   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3127   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3128 
3129   Register Scale;
3130   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from div_scale
    // is not usable.
3133 
3134     LLT S32 = LLT::scalar(32);
3135 
3136     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3137     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3138     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3139     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3140 
3141     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3142                               Scale1Unmerge.getReg(1));
3143     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3144                               Scale0Unmerge.getReg(1));
3145     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3146   } else {
3147     Scale = DivScale1.getReg(1);
3148   }
3149 
3150   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3151     .addUse(Fma4.getReg(0))
3152     .addUse(Fma3.getReg(0))
3153     .addUse(Mul.getReg(0))
3154     .addUse(Scale)
3155     .setMIFlags(Flags);
3156 
3157   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3158     .addUse(Fmas.getReg(0))
3159     .addUse(RHS)
3160     .addUse(LHS)
3161     .setMIFlags(Flags);
3162 
3163   MI.eraseFromParent();
3164   return true;
3165 }
3166 
3167 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3168                                                  MachineRegisterInfo &MRI,
3169                                                  MachineIRBuilder &B) const {
3170   Register Res = MI.getOperand(0).getReg();
3171   Register LHS = MI.getOperand(2).getReg();
3172   Register RHS = MI.getOperand(3).getReg();
3173   uint16_t Flags = MI.getFlags();
3174 
3175   LLT S32 = LLT::scalar(32);
3176   LLT S1 = LLT::scalar(1);
3177 
3178   auto Abs = B.buildFAbs(S32, RHS, Flags);
3179   const APFloat C0Val(1.0f);
3180 
3181   auto C0 = B.buildConstant(S32, 0x6f800000);
3182   auto C1 = B.buildConstant(S32, 0x2f800000);
3183   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
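  // The magic constants are float bit patterns: 0x6f800000 is 2^96 and
  // 0x2f800000 is 2^-32. If |RHS| is huge (> 2^96), pre-scale it by 2^-32 so its
  // reciprocal does not get flushed to zero, then multiply the result by the
  // same scale factor to compensate.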
3184 
3185   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3186   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3187 
3188   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3189 
3190   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3191     .addUse(Mul0.getReg(0))
3192     .setMIFlags(Flags);
3193 
3194   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3195 
3196   B.buildFMul(Res, Sel, Mul1, Flags);
3197 
3198   MI.eraseFromParent();
3199   return true;
3200 }
3201 
3202 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3203 // FIXME: Why do we handle this one but not other removed instructions?
3204 //
3205 // Reciprocal square root.  The clamp prevents infinite results, clamping
3206 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3207 // +-max_float.
3208 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3209                                                     MachineRegisterInfo &MRI,
3210                                                     MachineIRBuilder &B) const {
3211   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3212     return true;
3213 
3214   Register Dst = MI.getOperand(0).getReg();
3215   Register Src = MI.getOperand(2).getReg();
3216   auto Flags = MI.getFlags();
3217 
3218   LLT Ty = MRI.getType(Dst);
3219 
3220   const fltSemantics *FltSemantics;
3221   if (Ty == LLT::scalar(32))
3222     FltSemantics = &APFloat::IEEEsingle();
3223   else if (Ty == LLT::scalar(64))
3224     FltSemantics = &APFloat::IEEEdouble();
3225   else
3226     return false;
3227 
3228   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3229     .addUse(Src)
3230     .setMIFlags(Flags);
3231 
  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq result is already quieted (or not); use the variant that will select directly.
3234   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3235   const bool UseIEEE = MFI->getMode().IEEE;
3236 
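  // Clamp the result to [-max_float, +max_float]: min with +max, then max with
  // -max, using the IEEE variants when the function runs in IEEE mode.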
3237   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3238   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3239                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3240 
3241   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3242 
3243   if (UseIEEE)
3244     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3245   else
3246     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3247   MI.eraseFromParent();
3248   return true;
3249 }
3250 
3251 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3252   switch (IID) {
3253   case Intrinsic::amdgcn_ds_fadd:
3254     return AMDGPU::G_ATOMICRMW_FADD;
3255   case Intrinsic::amdgcn_ds_fmin:
3256     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3257   case Intrinsic::amdgcn_ds_fmax:
3258     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3259   default:
3260     llvm_unreachable("not a DS FP intrinsic");
3261   }
3262 }
3263 
3264 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3265                                                       MachineInstr &MI,
3266                                                       Intrinsic::ID IID) const {
3267   GISelChangeObserver &Observer = Helper.Observer;
3268   Observer.changingInstr(MI);
3269 
3270   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3271 
3272   // The remaining operands were used to set fields in the MemOperand on
3273   // construction.
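  // For the ds.fadd/fmin/fmax intrinsics these are presumably the ordering,
  // scope and isVolatile operands (operands 4..6); only the pointer and data
  // operands remain for the generic atomic opcode.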
3274   for (int I = 6; I > 3; --I)
3275     MI.RemoveOperand(I);
3276 
3277   MI.RemoveOperand(1); // Remove the intrinsic ID.
3278   Observer.changedInstr(MI);
3279   return true;
3280 }
3281 
3282 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3283                                             MachineRegisterInfo &MRI,
3284                                             MachineIRBuilder &B) const {
3285   uint64_t Offset =
3286     ST.getTargetLowering()->getImplicitParameterOffset(
3287       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3288   LLT DstTy = MRI.getType(DstReg);
3289   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3290 
3291   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3292   if (!loadInputValue(KernargPtrReg, B,
3293                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3294     return false;
3295 
3296   // FIXME: This should be nuw
3297   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3298   return true;
3299 }
3300 
3301 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3302                                                  MachineRegisterInfo &MRI,
3303                                                  MachineIRBuilder &B) const {
3304   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3305   if (!MFI->isEntryFunction()) {
3306     return legalizePreloadedArgIntrin(MI, MRI, B,
3307                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3308   }
3309 
3310   Register DstReg = MI.getOperand(0).getReg();
3311   if (!getImplicitArgPtr(DstReg, MRI, B))
3312     return false;
3313 
3314   MI.eraseFromParent();
3315   return true;
3316 }
3317 
3318 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3319                                               MachineRegisterInfo &MRI,
3320                                               MachineIRBuilder &B,
3321                                               unsigned AddrSpace) const {
3322   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3323   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3324   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3325   MI.eraseFromParent();
3326   return true;
3327 }
3328 
3329 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3330 // offset (the offset that is included in bounds checking and swizzling, to be
3331 // split between the instruction's voffset and immoffset fields) and soffset
3332 // (the offset that is excluded from bounds checking and swizzling, to go in
3333 // the instruction's soffset field).  This function takes the first kind of
3334 // offset and figures out how to split it between voffset and immoffset.
3335 std::tuple<Register, unsigned, unsigned>
3336 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3337                                         Register OrigOffset) const {
3338   const unsigned MaxImm = 4095;
3339   Register BaseReg;
3340   unsigned TotalConstOffset;
3341   MachineInstr *OffsetDef;
3342   const LLT S32 = LLT::scalar(32);
3343 
3344   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3345     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3346 
3347   unsigned ImmOffset = TotalConstOffset;
3348 
3349   // If the immediate value is too big for the immoffset field, put the value
3350   // and -4096 into the immoffset field so that the value that is copied/added
3351   // for the voffset field is a multiple of 4096, and it stands more chance
3352   // of being CSEd with the copy/add for another similar load/store.
3353   // However, do not do that rounding down to a multiple of 4096 if that is a
3354   // negative number, as it appears to be illegal to have a negative offset
3355   // in the vgpr, even if adding the immediate offset makes it positive.
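  // For example (hypothetical offset): a total constant offset of 8200 splits
  // into Overflow = 8192 (added to the voffset register) and ImmOffset = 8.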
3356   unsigned Overflow = ImmOffset & ~MaxImm;
3357   ImmOffset -= Overflow;
3358   if ((int32_t)Overflow < 0) {
3359     Overflow += ImmOffset;
3360     ImmOffset = 0;
3361   }
3362 
3363   if (Overflow != 0) {
3364     if (!BaseReg) {
3365       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3366     } else {
3367       auto OverflowVal = B.buildConstant(S32, Overflow);
3368       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3369     }
3370   }
3371 
3372   if (!BaseReg)
3373     BaseReg = B.buildConstant(S32, 0).getReg(0);
3374 
3375   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3376 }
3377 
3378 /// Handle register layout difference for f16 images for some subtargets.
3379 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3380                                              MachineRegisterInfo &MRI,
3381                                              Register Reg) const {
3382   if (!ST.hasUnpackedD16VMem())
3383     return Reg;
3384 
3385   const LLT S16 = LLT::scalar(16);
3386   const LLT S32 = LLT::scalar(32);
3387   LLT StoreVT = MRI.getType(Reg);
3388   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3389 
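  // On unpacked subtargets each 16-bit element occupies the low half of a 32-bit
  // register, so e.g. a <4 x s16> source is rewritten (roughly) as a <4 x s32>
  // vector of any-extended elements.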
3390   auto Unmerge = B.buildUnmerge(S16, Reg);
3391 
3392   SmallVector<Register, 4> WideRegs;
3393   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3394     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3395 
3396   int NumElts = StoreVT.getNumElements();
3397 
3398   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3399 }
3400 
3401 Register AMDGPULegalizerInfo::fixStoreSourceType(
3402   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3403   MachineRegisterInfo *MRI = B.getMRI();
3404   LLT Ty = MRI->getType(VData);
3405 
3406   const LLT S16 = LLT::scalar(16);
3407 
3408   // Fixup illegal register types for i8 stores.
3409   if (Ty == LLT::scalar(8) || Ty == S16) {
3410     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3411     return AnyExt;
3412   }
3413 
3414   if (Ty.isVector()) {
3415     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3416       if (IsFormat)
3417         return handleD16VData(B, *MRI, VData);
3418     }
3419   }
3420 
3421   return VData;
3422 }
3423 
3424 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3425                                               MachineRegisterInfo &MRI,
3426                                               MachineIRBuilder &B,
3427                                               bool IsTyped,
3428                                               bool IsFormat) const {
3429   Register VData = MI.getOperand(1).getReg();
3430   LLT Ty = MRI.getType(VData);
3431   LLT EltTy = Ty.getScalarType();
3432   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3433   const LLT S32 = LLT::scalar(32);
3434 
3435   VData = fixStoreSourceType(B, VData, IsFormat);
3436   Register RSrc = MI.getOperand(2).getReg();
3437 
3438   MachineMemOperand *MMO = *MI.memoperands_begin();
3439   const int MemSize = MMO->getSize();
3440 
3441   unsigned ImmOffset;
3442   unsigned TotalOffset;
3443 
3444   // The typed intrinsics add an immediate after the registers.
3445   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3446 
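  // As a rough guide to the operand layout being parsed here: the raw variants
  // are (vdata, rsrc, voffset, soffset, aux), the struct variants insert a
  // vindex after rsrc, and the typed (tbuffer) variants add a format immediate
  // before aux.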
3447   // The struct intrinsic variants add one additional operand over raw.
3448   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3449   Register VIndex;
3450   int OpOffset = 0;
3451   if (HasVIndex) {
3452     VIndex = MI.getOperand(3).getReg();
3453     OpOffset = 1;
3454   }
3455 
3456   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3457   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3458 
3459   unsigned Format = 0;
3460   if (IsTyped) {
3461     Format = MI.getOperand(5 + OpOffset).getImm();
3462     ++OpOffset;
3463   }
3464 
3465   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3466 
3467   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3468   if (TotalOffset != 0)
3469     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3470 
3471   unsigned Opc;
3472   if (IsTyped) {
3473     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3474                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3475   } else if (IsFormat) {
3476     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3477                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3478   } else {
3479     switch (MemSize) {
3480     case 1:
3481       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3482       break;
3483     case 2:
3484       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3485       break;
3486     default:
3487       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3488       break;
3489     }
3490   }
3491 
3492   if (!VIndex)
3493     VIndex = B.buildConstant(S32, 0).getReg(0);
3494 
3495   auto MIB = B.buildInstr(Opc)
3496     .addUse(VData)              // vdata
3497     .addUse(RSrc)               // rsrc
3498     .addUse(VIndex)             // vindex
3499     .addUse(VOffset)            // voffset
3500     .addUse(SOffset)            // soffset
3501     .addImm(ImmOffset);         // offset(imm)
3502 
3503   if (IsTyped)
3504     MIB.addImm(Format);
3505 
3506   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3507      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3508      .addMemOperand(MMO);
3509 
3510   MI.eraseFromParent();
3511   return true;
3512 }
3513 
3514 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3515                                              MachineRegisterInfo &MRI,
3516                                              MachineIRBuilder &B,
3517                                              bool IsFormat,
3518                                              bool IsTyped) const {
3519   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3520   MachineMemOperand *MMO = *MI.memoperands_begin();
3521   const int MemSize = MMO->getSize();
3522   const LLT S32 = LLT::scalar(32);
3523 
3524   Register Dst = MI.getOperand(0).getReg();
3525   Register RSrc = MI.getOperand(2).getReg();
3526 
3527   // The typed intrinsics add an immediate after the registers.
3528   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3529 
3530   // The struct intrinsic variants add one additional operand over raw.
3531   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3532   Register VIndex;
3533   int OpOffset = 0;
3534   if (HasVIndex) {
3535     VIndex = MI.getOperand(3).getReg();
3536     OpOffset = 1;
3537   }
3538 
3539   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3540   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3541 
3542   unsigned Format = 0;
3543   if (IsTyped) {
3544     Format = MI.getOperand(5 + OpOffset).getImm();
3545     ++OpOffset;
3546   }
3547 
3548   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3549   unsigned ImmOffset;
3550   unsigned TotalOffset;
3551 
3552   LLT Ty = MRI.getType(Dst);
3553   LLT EltTy = Ty.getScalarType();
3554   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3555   const bool Unpacked = ST.hasUnpackedD16VMem();
3556 
3557   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3558   if (TotalOffset != 0)
3559     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3560 
3561   unsigned Opc;
3562 
3563   if (IsTyped) {
3564     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3565                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3566   } else if (IsFormat) {
3567     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3568                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3569   } else {
3570     switch (MemSize) {
3571     case 1:
3572       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3573       break;
3574     case 2:
3575       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3576       break;
3577     default:
3578       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3579       break;
3580     }
3581   }
3582 
3583   Register LoadDstReg;
3584 
3585   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3586   LLT UnpackedTy = Ty.changeElementSize(32);
3587 
3588   if (IsExtLoad)
3589     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3590   else if (Unpacked && IsD16 && Ty.isVector())
3591     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3592   else
3593     LoadDstReg = Dst;
3594 
3595   if (!VIndex)
3596     VIndex = B.buildConstant(S32, 0).getReg(0);
3597 
3598   auto MIB = B.buildInstr(Opc)
3599     .addDef(LoadDstReg)         // vdata
3600     .addUse(RSrc)               // rsrc
3601     .addUse(VIndex)             // vindex
3602     .addUse(VOffset)            // voffset
3603     .addUse(SOffset)            // soffset
3604     .addImm(ImmOffset);         // offset(imm)
3605 
3606   if (IsTyped)
3607     MIB.addImm(Format);
3608 
3609   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3610      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3611      .addMemOperand(MMO);
3612 
3613   if (LoadDstReg != Dst) {
3614     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3615 
    // The result was widened for the extending load; truncate it back.
3617     if (IsExtLoad)
3618       B.buildTrunc(Dst, LoadDstReg);
3619     else {
3620       // Repack to original 16-bit vector result
3621       // FIXME: G_TRUNC should work, but legalization currently fails
3622       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3623       SmallVector<Register, 4> Repack;
3624       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3625         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3626       B.buildMerge(Dst, Repack);
3627     }
3628   }
3629 
3630   MI.eraseFromParent();
3631   return true;
3632 }
3633 
3634 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3635                                                MachineIRBuilder &B,
3636                                                bool IsInc) const {
3637   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3638                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3639   B.buildInstr(Opc)
3640     .addDef(MI.getOperand(0).getReg())
3641     .addUse(MI.getOperand(2).getReg())
3642     .addUse(MI.getOperand(3).getReg())
3643     .cloneMemRefs(MI);
3644   MI.eraseFromParent();
3645   return true;
3646 }
3647 
3648 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3649   switch (IntrID) {
3650   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3651   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3652     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3653   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3654   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3655     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3656   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3657   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3658     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3659   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3660   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3661     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3662   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3663   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3664     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3665   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3666   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3667     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3668   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3669   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3670     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3671   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3672   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3673     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3674   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3675   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3676     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3677   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3678   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3679     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3680   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3681   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3682     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3683   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3684   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3685     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3686   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3687   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3688     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3689   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3690   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3691     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3692   default:
3693     llvm_unreachable("unhandled atomic opcode");
3694   }
3695 }
3696 
3697 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3698                                                MachineIRBuilder &B,
3699                                                Intrinsic::ID IID) const {
3700   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3701                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3702   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3703 
3704   Register Dst;
3705 
3706   int OpOffset = 0;
3707   if (HasReturn) {
3708     // A few FP atomics do not support return values.
3709     Dst = MI.getOperand(0).getReg();
3710   } else {
3711     OpOffset = -1;
3712   }
3713 
3714   Register VData = MI.getOperand(2 + OpOffset).getReg();
3715   Register CmpVal;
3716 
3717   if (IsCmpSwap) {
3718     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3719     ++OpOffset;
3720   }
3721 
3722   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3723   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3724 
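  // As a rough sketch of the operand layout being parsed: for a returning struct
  // cmpswap the instruction looks like (dst, intrinsic-id, vdata, cmp, rsrc,
  // vindex, voffset, soffset, aux); raw variants drop vindex, non-cmpswap
  // variants drop cmp, and the no-return FP atomics drop dst.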
3725   // The struct intrinsic variants add one additional operand over raw.
3726   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3727   Register VIndex;
3728   if (HasVIndex) {
3729     VIndex = MI.getOperand(4 + OpOffset).getReg();
3730     ++OpOffset;
3731   }
3732 
3733   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3734   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3735   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3736 
3737   MachineMemOperand *MMO = *MI.memoperands_begin();
3738 
3739   unsigned ImmOffset;
3740   unsigned TotalOffset;
3741   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3742   if (TotalOffset != 0)
3743     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3744 
3745   if (!VIndex)
3746     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3747 
3748   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3749 
3750   if (HasReturn)
3751     MIB.addDef(Dst);
3752 
3753   MIB.addUse(VData); // vdata
3754 
3755   if (IsCmpSwap)
3756     MIB.addReg(CmpVal);
3757 
3758   MIB.addUse(RSrc)               // rsrc
3759      .addUse(VIndex)             // vindex
3760      .addUse(VOffset)            // voffset
3761      .addUse(SOffset)            // soffset
3762      .addImm(ImmOffset)          // offset(imm)
3763      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3764      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3765      .addMemOperand(MMO);
3766 
3767   MI.eraseFromParent();
3768   return true;
3769 }
3770 
/// Pack the 16-bit typed address operands of \p MI in the range
/// [\p AddrIdx, \p EndIdx) into dword sized <2 x s16> registers, appending the
/// results to \p PackedAddrs.
3773 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3774                                         SmallVectorImpl<Register> &PackedAddrs,
3775                                         int AddrIdx, int DimIdx, int EndIdx,
3776                                         int NumGradients) {
3777   const LLT S16 = LLT::scalar(16);
3778   const LLT V2S16 = LLT::vector(2, 16);
3779 
3780   for (int I = AddrIdx; I < EndIdx; ++I) {
3781     MachineOperand &SrcOp = MI.getOperand(I);
3782     if (!SrcOp.isReg())
3783       continue; // _L to _LZ may have eliminated this.
3784 
3785     Register AddrReg = SrcOp.getReg();
3786 
3787     if (I < DimIdx) {
3788       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3789       PackedAddrs.push_back(AddrReg);
3790     } else {
3791       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3792       // derivatives dx/dh and dx/dv are packed with undef.
3793       if (((I + 1) >= EndIdx) ||
3794           ((NumGradients / 2) % 2 == 1 &&
3795            (I == DimIdx + (NumGradients / 2) - 1 ||
3796             I == DimIdx + NumGradients - 1)) ||
3797           // Check for _L to _LZ optimization
3798           !MI.getOperand(I + 1).isReg()) {
3799         PackedAddrs.push_back(
3800             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3801                 .getReg(0));
3802       } else {
3803         PackedAddrs.push_back(
3804             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3805                 .getReg(0));
3806         ++I;
3807       }
3808     }
3809   }
3810 }
3811 
3812 /// Convert from separate vaddr components to a single vector address register,
3813 /// and replace the remaining operands with $noreg.
3814 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3815                                      int DimIdx, int NumVAddrs) {
3816   const LLT S32 = LLT::scalar(32);
3817 
3818   SmallVector<Register, 8> AddrRegs;
3819   for (int I = 0; I != NumVAddrs; ++I) {
3820     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3821     if (SrcOp.isReg()) {
3822       AddrRegs.push_back(SrcOp.getReg());
3823       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3824     }
3825   }
3826 
3827   int NumAddrRegs = AddrRegs.size();
3828   if (NumAddrRegs != 1) {
3829     // Round up to 8 elements for v5-v7
3830     // FIXME: Missing intermediate sized register classes and instructions.
3831     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3832       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3833       auto Undef = B.buildUndef(S32);
3834       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3835       NumAddrRegs = RoundedNumRegs;
3836     }
3837 
3838     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3839     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3840   }
3841 
3842   for (int I = 1; I != NumVAddrs; ++I) {
3843     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3844     if (SrcOp.isReg())
3845       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3846   }
3847 }
3848 
3849 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3850 ///
3851 /// Depending on the subtarget, load/store with 16-bit element data need to be
3852 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3853 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3854 /// registers.
3855 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
/// now-unnecessary arguments with $noreg.
3862 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3863     MachineInstr &MI, MachineIRBuilder &B,
3864     GISelChangeObserver &Observer,
3865     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3866 
3867   const int NumDefs = MI.getNumExplicitDefs();
3868   bool IsTFE = NumDefs == 2;
3869   // We are only processing the operands of d16 image operations on subtargets
3870   // that use the unpacked register layout, or need to repack the TFE result.
3871 
3872   // TODO: Do we need to guard against already legalized intrinsics?
3873   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3874     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3875 
3876   MachineRegisterInfo *MRI = B.getMRI();
3877   const LLT S32 = LLT::scalar(32);
3878   const LLT S16 = LLT::scalar(16);
3879   const LLT V2S16 = LLT::vector(2, 16);
3880 
3881   // Index of first address argument
3882   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3883 
3884   int NumVAddrs, NumGradients;
3885   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3886   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3887     getDMaskIdx(BaseOpcode, NumDefs);
3888   unsigned DMask = 0;
3889 
  // Check for 16-bit addresses; if found, they are packed into dwords below.
3891   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3892   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3893   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3894   const bool IsG16 = GradTy == S16;
3895   const bool IsA16 = AddrTy == S16;
3896 
3897   int DMaskLanes = 0;
3898   if (!BaseOpcode->Atomic) {
3899     DMask = MI.getOperand(DMaskIdx).getImm();
3900     if (BaseOpcode->Gather4) {
3901       DMaskLanes = 4;
3902     } else if (DMask != 0) {
3903       DMaskLanes = countPopulation(DMask);
3904     } else if (!IsTFE && !BaseOpcode->Store) {
3905       // If dmask is 0, this is a no-op load. This can be eliminated.
3906       B.buildUndef(MI.getOperand(0));
3907       MI.eraseFromParent();
3908       return true;
3909     }
3910   }
3911 
3912   Observer.changingInstr(MI);
3913   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3914 
3915   unsigned NewOpcode = NumDefs == 0 ?
3916     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3917 
3918   // Track that we legalized this
3919   MI.setDesc(B.getTII().get(NewOpcode));
3920 
  // With TFE on, an error flag result is expected even when dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
3923   if (IsTFE && DMask == 0) {
3924     DMask = 0x1;
3925     DMaskLanes = 1;
3926     MI.getOperand(DMaskIdx).setImm(DMask);
3927   }
3928 
3929   if (BaseOpcode->Atomic) {
3930     Register VData0 = MI.getOperand(2).getReg();
3931     LLT Ty = MRI->getType(VData0);
3932 
3933     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3934     if (Ty.isVector())
3935       return false;
3936 
3937     if (BaseOpcode->AtomicX2) {
3938       Register VData1 = MI.getOperand(3).getReg();
3939       // The two values are packed in one register.
3940       LLT PackedTy = LLT::vector(2, Ty);
3941       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3942       MI.getOperand(2).setReg(Concat.getReg(0));
3943       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3944     }
3945   }
3946 
3947   int CorrectedNumVAddrs = NumVAddrs;
3948 
  // Optimize _L to _LZ when the LOD argument is known to be zero (or negative).
3950   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3951         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3952     const ConstantFP *ConstantLod;
3953     const int LodIdx = AddrIdx + NumVAddrs - 1;
3954 
3955     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3956       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3957         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3958         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3959           LZMappingInfo->LZ, ImageDimIntr->Dim);
3960 
3961         // The starting indexes should remain in the same place.
3962         --NumVAddrs;
3963         --CorrectedNumVAddrs;
3964 
3965         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3966           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3967         MI.RemoveOperand(LodIdx);
3968       }
3969     }
3970   }
3971 
  // Optimize _mip away when 'lod' is zero.
3973   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3974     int64_t ConstantLod;
3975     const int LodIdx = AddrIdx + NumVAddrs - 1;
3976 
3977     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3978       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3981         MI.getOperand(LodIdx).ChangeToImmediate(0);
3982         --CorrectedNumVAddrs;
3983       }
3984     }
3985   }
3986 
3987   // Rewrite the addressing register layout before doing anything else.
3988   if (IsA16 || IsG16) {
3989     if (IsA16) {
3990       // Target must support the feature and gradients need to be 16 bit too
3991       if (!ST.hasA16() || !IsG16)
3992         return false;
3993     } else if (!ST.hasG16())
3994       return false;
3995 
3996     if (NumVAddrs > 1) {
3997       SmallVector<Register, 4> PackedRegs;
3998       // Don't compress addresses for G16
3999       const int PackEndIdx =
4000           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4001       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4002                                   PackEndIdx, NumGradients);
4003 
4004       if (!IsA16) {
4005         // Add uncompressed address
4006         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
          Register AddrReg = MI.getOperand(I).getReg();
4008           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4009           PackedRegs.push_back(AddrReg);
4010         }
4011       }
4012 
4013       // See also below in the non-a16 branch
4014       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4015 
4016       if (!UseNSA && PackedRegs.size() > 1) {
4017         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4018         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4019         PackedRegs[0] = Concat.getReg(0);
4020         PackedRegs.resize(1);
4021       }
4022 
4023       const int NumPacked = PackedRegs.size();
4024       for (int I = 0; I != NumVAddrs; ++I) {
4025         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4026         if (!SrcOp.isReg()) {
4027           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4028           continue;
4029         }
4030 
4031         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4032 
4033         if (I < NumPacked)
4034           SrcOp.setReg(PackedRegs[I]);
4035         else
4036           SrcOp.setReg(AMDGPU::NoRegister);
4037       }
4038     }
4039   } else {
4040     // If the register allocator cannot place the address registers contiguously
4041     // without introducing moves, then using the non-sequential address encoding
4042     // is always preferable, since it saves VALU instructions and is usually a
4043     // wash in terms of code size or even better.
4044     //
4045     // However, we currently have no way of hinting to the register allocator
4046     // that MIMG addresses should be placed contiguously when it is possible to
4047     // do so, so force non-NSA for the common 2-address case as a heuristic.
4048     //
4049     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4050     // allocation when possible.
4051     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4052 
4053     if (!UseNSA && NumVAddrs > 1)
4054       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4055   }
4056 
4057   int Flags = 0;
4058   if (IsA16)
4059     Flags |= 1;
4060   if (IsG16)
4061     Flags |= 2;
4062   MI.addOperand(MachineOperand::CreateImm(Flags));
4063 
4064   if (BaseOpcode->Store) { // No TFE for stores?
4065     // TODO: Handle dmask trim
4066     Register VData = MI.getOperand(1).getReg();
4067     LLT Ty = MRI->getType(VData);
4068     if (!Ty.isVector() || Ty.getElementType() != S16)
4069       return true;
4070 
4071     Register RepackedReg = handleD16VData(B, *MRI, VData);
4072     if (RepackedReg != VData) {
4073       MI.getOperand(1).setReg(RepackedReg);
4074     }
4075 
4076     return true;
4077   }
4078 
4079   Register DstReg = MI.getOperand(0).getReg();
4080   LLT Ty = MRI->getType(DstReg);
4081   const LLT EltTy = Ty.getScalarType();
4082   const bool IsD16 = Ty.getScalarType() == S16;
4083   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4084 
4085   // Confirm that the return type is large enough for the dmask specified
4086   if (NumElts < DMaskLanes)
4087     return false;
4088 
4089   if (NumElts > 4 || DMaskLanes > 4)
4090     return false;
4091 
4092   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4093   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4094 
  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4098   LLT RoundedTy;
4099 
  // S32 vector to cover all data, plus the TFE result element.
4101   LLT TFETy;
4102 
4103   // Register type to use for each loaded component. Will be S32 or V2S16.
4104   LLT RegTy;
4105 
4106   if (IsD16 && ST.hasUnpackedD16VMem()) {
4107     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4108     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4109     RegTy = S32;
4110   } else {
4111     unsigned EltSize = EltTy.getSizeInBits();
4112     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4113     unsigned RoundedSize = 32 * RoundedElts;
4114     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4115     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4116     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4117   }
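  // Worked example (illustrative): a packed-D16 <3 x s16> load with TFE has
  // AdjustedTy = <3 x s16> (48 bits), so RoundedTy = <4 x s16>, TFETy = <3 x s32>
  // (two data dwords plus the error dword), and RegTy = S32 since TFE is on.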
4118 
4119   // The return type does not need adjustment.
4120   // TODO: Should we change s16 case to s32 or <2 x s16>?
4121   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4122     return true;
4123 
4124   Register Dst1Reg;
4125 
4126   // Insert after the instruction.
4127   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4128 
4129   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4130   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4131   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4132   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4133 
4134   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4135 
4136   MI.getOperand(0).setReg(NewResultReg);
4137 
4138   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
4140   // register, with one additional dword beyond the loaded data. Rewrite the
4141   // return type to use a single register result.
4142 
4143   if (IsTFE) {
4144     Dst1Reg = MI.getOperand(1).getReg();
4145     if (MRI->getType(Dst1Reg) != S32)
4146       return false;
4147 
4148     // TODO: Make sure the TFE operand bit is set.
4149     MI.RemoveOperand(1);
4150 
4151     // Handle the easy case that requires no repack instructions.
4152     if (Ty == S32) {
4153       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4154       return true;
4155     }
4156   }
4157 
4158   // Now figure out how to copy the new result register back into the old
4159   // result.
4160   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4161 
4162   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4163 
4164   if (ResultNumRegs == 1) {
4165     assert(!IsTFE);
4166     ResultRegs[0] = NewResultReg;
4167   } else {
4168     // We have to repack into a new vector of some kind.
4169     for (int I = 0; I != NumDataRegs; ++I)
4170       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4171     B.buildUnmerge(ResultRegs, NewResultReg);
4172 
4173     // Drop the final TFE element to get the data part. The TFE result is
4174     // directly written to the right place already.
4175     if (IsTFE)
4176       ResultRegs.resize(NumDataRegs);
4177   }
4178 
4179   // For an s16 scalar result, we form an s32 result with a truncate regardless
4180   // of packed vs. unpacked.
4181   if (IsD16 && !Ty.isVector()) {
4182     B.buildTrunc(DstReg, ResultRegs[0]);
4183     return true;
4184   }
4185 
4186   // Avoid a build/concat_vector of 1 entry.
4187   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4188     B.buildBitcast(DstReg, ResultRegs[0]);
4189     return true;
4190   }
4191 
4192   assert(Ty.isVector());
4193 
4194   if (IsD16) {
4195     // For packed D16 results with TFE enabled, all the data components are
4196     // S32. Cast back to the expected type.
4197     //
    // TODO: We don't really need to load s32 elements. We would only need one
4199     // cast for the TFE result if a multiple of v2s16 was used.
4200     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4201       for (Register &Reg : ResultRegs)
4202         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4203     } else if (ST.hasUnpackedD16VMem()) {
4204       for (Register &Reg : ResultRegs)
4205         Reg = B.buildTrunc(S16, Reg).getReg(0);
4206     }
4207   }
4208 
4209   auto padWithUndef = [&](LLT Ty, int NumElts) {
4210     if (NumElts == 0)
4211       return;
4212     Register Undef = B.buildUndef(Ty).getReg(0);
4213     for (int I = 0; I != NumElts; ++I)
4214       ResultRegs.push_back(Undef);
4215   };
4216 
4217   // Pad out any elements eliminated due to the dmask.
4218   LLT ResTy = MRI->getType(ResultRegs[0]);
4219   if (!ResTy.isVector()) {
4220     padWithUndef(ResTy, NumElts - ResultRegs.size());
4221     B.buildBuildVector(DstReg, ResultRegs);
4222     return true;
4223   }
4224 
4225   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4226   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4227 
4228   // Deal with the one annoying legal case.
4229   const LLT V3S16 = LLT::vector(3, 16);
4230   if (Ty == V3S16) {
4231     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4232     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4233     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4234     return true;
4235   }
4236 
4237   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4238   B.buildConcatVectors(DstReg, ResultRegs);
4239   return true;
4240 }
4241 
4242 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4243   LegalizerHelper &Helper, MachineInstr &MI) const {
4244   MachineIRBuilder &B = Helper.MIRBuilder;
4245   GISelChangeObserver &Observer = Helper.Observer;
4246 
4247   Register Dst = MI.getOperand(0).getReg();
4248   LLT Ty = B.getMRI()->getType(Dst);
4249   unsigned Size = Ty.getSizeInBits();
4250   MachineFunction &MF = B.getMF();
4251 
4252   Observer.changingInstr(MI);
4253 
4254   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4255     Ty = getBitcastRegisterType(Ty);
4256     Helper.bitcastDst(MI, Ty, 0);
4257     Dst = MI.getOperand(0).getReg();
4258     B.setInsertPt(B.getMBB(), MI);
4259   }
4260 
4261   // FIXME: We don't really need this intermediate instruction. The intrinsic
4262   // should be fixed to have a memory operand. Since it's readnone, we're not
4263   // allowed to add one.
4264   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4265   MI.RemoveOperand(1); // Remove intrinsic ID
4266 
4267   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4268   // TODO: Should this use datalayout alignment?
4269   const unsigned MemSize = (Size + 7) / 8;
4270   const Align MemAlign(4);
4271   MachineMemOperand *MMO = MF.getMachineMemOperand(
4272       MachinePointerInfo(),
4273       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4274           MachineMemOperand::MOInvariant,
4275       MemSize, MemAlign);
4276   MI.addMemOperand(MF, MMO);
4277 
4278   // There are no 96-bit result scalar loads, but widening to 128-bit should
4279   // always be legal. We may need to restore this to a 96-bit result if it turns
4280   // out this needs to be converted to a vector load during RegBankSelect.
4281   if (!isPowerOf2_32(Size)) {
4282     if (Ty.isVector())
4283       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4284     else
4285       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4286   }
4287 
4288   Observer.changedInstr(MI);
4289   return true;
4290 }
4291 
4292 // TODO: Move to selection
4293 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4294                                                 MachineRegisterInfo &MRI,
4295                                                 MachineIRBuilder &B) const {
  // On a non-HSA path, or when the trap handler is disabled, just insert an
  // s_endpgm instruction.
4297   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4298       !ST.isTrapHandlerEnabled()) {
4299     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4300   } else {
4301     // Pass queue pointer to trap handler as input, and insert trap instruction
4302     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4303     MachineRegisterInfo &MRI = *B.getMRI();
4304 
4305     Register LiveIn =
4306       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4307     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4308       return false;
4309 
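    // The HSA trap handler ABI expects the queue pointer in SGPR0_SGPR1.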
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // On non-HSA paths, or if the trap handler is disabled, report a warning
  // instead of emitting a trap.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

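// Dispatch AMDGPU intrinsics to their legalization routines. Control-flow
// intrinsics are lowered to the SI_* pseudos here; most others forward to
// dedicated helpers, and anything unhandled is already considered legal.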
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND user with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

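      // Operand 1 is the exec mask defined by the intrinsic; operand 3 is its
      // input (the branch condition for amdgcn_if, the saved exec mask for
      // amdgcn_else).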
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

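      // SI_IF and SI_ELSE operate on the wave-wide exec mask, so constrain
      // both mask registers to the wave mask register class.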
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

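      // SI_LOOP removes the finished lanes from exec and branches back to the
      // loop header while any lanes remain active.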
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}