1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
/// Largest size, in bits, accepted by isRegisterSize(); also the width of the
/// widest scalar (MaxScalar) used when building the legalization rules.
static constexpr unsigned MaxRegisterSize = 1024u;
48 
49 // Round the number of elements to the next power of two elements
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
63 /// \returs true if this is an odd sized vector which should widen by adding an
64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
111   };
112 }
113 
114 // Increase the number of vector elements to reach the next multiple of 32-bit
115 // type.
116 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
117   return [=](const LegalityQuery &Query) {
118     const LLT Ty = Query.Types[TypeIdx];
119 
120     const LLT EltTy = Ty.getElementType();
121     const int Size = Ty.getSizeInBits();
122     const int EltSize = EltTy.getSizeInBits();
123     const int NextMul32 = (Size + 31) / 32;
124 
125     assert(EltSize < 32);
126 
127     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
128     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
129   };
130 }
131 
132 static LLT getBitcastRegisterType(const LLT Ty) {
133   const unsigned Size = Ty.getSizeInBits();
134 
135   LLT CoercedTy;
136   if (Size <= 32) {
137     // <2 x s8> -> s16
138     // <4 x s8> -> s32
139     return LLT::scalar(Size);
140   }
141 
142   return LLT::scalarOrVector(Size / 32, 32);
143 }
144 
145 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
149   };
150 }
151 
152 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     unsigned Size = Ty.getSizeInBits();
156     assert(Size % 32 == 0);
157     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
158   };
159 }
160 
161 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
165   };
166 }
167 
168 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
172   };
173 }
174 
175 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
176   return [=](const LegalityQuery &Query) {
177     const LLT QueryTy = Query.Types[TypeIdx];
178     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
179   };
180 }
181 
182 static bool isRegisterSize(unsigned Size) {
183   return Size % 32 == 0 && Size <= MaxRegisterSize;
184 }
185 
186 static bool isRegisterVectorElementType(LLT EltTy) {
187   const int EltSize = EltTy.getSizeInBits();
188   return EltSize == 16 || EltSize % 32 == 0;
189 }
190 
191 static bool isRegisterVectorType(LLT Ty) {
192   const int EltSize = Ty.getElementType().getSizeInBits();
193   return EltSize == 32 || EltSize == 64 ||
194          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
195          EltSize == 128 || EltSize == 256;
196 }
197 
198 static bool isRegisterType(LLT Ty) {
199   if (!isRegisterSize(Ty.getSizeInBits()))
200     return false;
201 
202   if (Ty.isVector())
203     return isRegisterVectorType(Ty);
204 
205   return true;
206 }
207 
208 // Any combination of 32 or 64-bit elements up the maximum register size, and
209 // multiples of v2s16.
210 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
211   return [=](const LegalityQuery &Query) {
212     return isRegisterType(Query.Types[TypeIdx]);
213   };
214 }
215 
216 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
217   return [=](const LegalityQuery &Query) {
218     const LLT QueryTy = Query.Types[TypeIdx];
219     if (!QueryTy.isVector())
220       return false;
221     const LLT EltTy = QueryTy.getElementType();
222     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
223   };
224 }
225 
226 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
227   return [=](const LegalityQuery &Query) {
228     const LLT Ty = Query.Types[TypeIdx];
229     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
230            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
231   };
232 }
233 
234 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
235 // handle some operations by just promoting the register during
236 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
237 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
238                                     bool IsLoad) {
239   switch (AS) {
240   case AMDGPUAS::PRIVATE_ADDRESS:
241     // FIXME: Private element size.
242     return 32;
243   case AMDGPUAS::LOCAL_ADDRESS:
244     return ST.useDS128() ? 128 : 64;
245   case AMDGPUAS::GLOBAL_ADDRESS:
246   case AMDGPUAS::CONSTANT_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
248     // Treat constant and global as identical. SMRD loads are sometimes usable for
249     // global loads (ideally constant address space should be eliminated)
250     // depending on the context. Legality cannot be context dependent, but
251     // RegBankSelect can split the load as necessary depending on the pointer
252     // register bank/uniformity and if the memory is invariant or not written in a
253     // kernel.
254     return IsLoad ? 512 : 128;
255   default:
256     // Flat addresses may contextually need to be split to 32-bit parts if they
257     // may alias scratch depending on the subtarget.
258     return 128;
259   }
260 }
261 
262 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
263                                  const LegalityQuery &Query,
264                                  unsigned Opcode) {
265   const LLT Ty = Query.Types[0];
266 
267   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268   const bool IsLoad = Opcode != AMDGPU::G_STORE;
269 
270   unsigned RegSize = Ty.getSizeInBits();
271   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273   unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279   // TODO: We should be able to widen loads if the alignment is high enough, but
280   // we also need to modify the memory access size.
281 #if 0
282   // Accept widening loads based on alignment.
283   if (IsLoad && MemSize < Size)
284     MemSize = std::max(MemSize, Align);
285 #endif
286 
287   // Only 1-byte and 2-byte to 32-bit extloads are valid.
288   if (MemSize != RegSize && RegSize != 32)
289     return false;
290 
291   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
292     return false;
293 
294   switch (MemSize) {
295   case 8:
296   case 16:
297   case 32:
298   case 64:
299   case 128:
300     break;
301   case 96:
302     if (!ST.hasDwordx3LoadStores())
303       return false;
304     break;
305   case 256:
306   case 512:
307     // These may contextually need to be broken down.
308     break;
309   default:
310     return false;
311   }
312 
313   assert(RegSize >= MemSize);
314 
315   if (AlignBits < MemSize) {
316     const SITargetLowering *TLI = ST.getTargetLowering();
317     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
318                                                  Align(AlignBits / 8)))
319       return false;
320   }
321 
322   return true;
323 }
324 
325 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
326 // workaround this. Eventually it should ignore the type for loads and only care
327 // about the size. Return true in cases where we will workaround this for now by
328 // bitcasting.
329 static bool loadStoreBitcastWorkaround(const LLT Ty) {
330   if (EnableNewLegality)
331     return false;
332 
333   const unsigned Size = Ty.getSizeInBits();
334   if (Size <= 64)
335     return false;
336   if (!Ty.isVector())
337     return true;
338   unsigned EltSize = Ty.getElementType().getSizeInBits();
339   return EltSize != 32 && EltSize != 64;
340 }
341 
342 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
343                              unsigned Opcode) {
344   const LLT Ty = Query.Types[0];
345   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
346          !loadStoreBitcastWorkaround(Ty);
347 }
348 
349 /// Return true if a load or store of the type should be lowered with a bitcast
350 /// to a different type.
351 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
352                                        const unsigned MemSizeInBits) {
353   const unsigned Size = Ty.getSizeInBits();
354     if (Size != MemSizeInBits)
355       return Size <= 32 && Ty.isVector();
356 
357   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
358     return true;
359   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
360          !isRegisterVectorElementType(Ty.getElementType());
361 }
362 
363 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
364                                          const GCNTargetMachine &TM)
365   :  ST(ST_) {
366   using namespace TargetOpcode;
367 
368   auto GetAddrSpacePtr = [&TM](unsigned AS) {
369     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
370   };
371 
372   const LLT S1 = LLT::scalar(1);
373   const LLT S16 = LLT::scalar(16);
374   const LLT S32 = LLT::scalar(32);
375   const LLT S64 = LLT::scalar(64);
376   const LLT S128 = LLT::scalar(128);
377   const LLT S256 = LLT::scalar(256);
378   const LLT S512 = LLT::scalar(512);
379   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
380 
381   const LLT V2S16 = LLT::vector(2, 16);
382   const LLT V4S16 = LLT::vector(4, 16);
383 
384   const LLT V2S32 = LLT::vector(2, 32);
385   const LLT V3S32 = LLT::vector(3, 32);
386   const LLT V4S32 = LLT::vector(4, 32);
387   const LLT V5S32 = LLT::vector(5, 32);
388   const LLT V6S32 = LLT::vector(6, 32);
389   const LLT V7S32 = LLT::vector(7, 32);
390   const LLT V8S32 = LLT::vector(8, 32);
391   const LLT V9S32 = LLT::vector(9, 32);
392   const LLT V10S32 = LLT::vector(10, 32);
393   const LLT V11S32 = LLT::vector(11, 32);
394   const LLT V12S32 = LLT::vector(12, 32);
395   const LLT V13S32 = LLT::vector(13, 32);
396   const LLT V14S32 = LLT::vector(14, 32);
397   const LLT V15S32 = LLT::vector(15, 32);
398   const LLT V16S32 = LLT::vector(16, 32);
399   const LLT V32S32 = LLT::vector(32, 32);
400 
401   const LLT V2S64 = LLT::vector(2, 64);
402   const LLT V3S64 = LLT::vector(3, 64);
403   const LLT V4S64 = LLT::vector(4, 64);
404   const LLT V5S64 = LLT::vector(5, 64);
405   const LLT V6S64 = LLT::vector(6, 64);
406   const LLT V7S64 = LLT::vector(7, 64);
407   const LLT V8S64 = LLT::vector(8, 64);
408   const LLT V16S64 = LLT::vector(16, 64);
409 
410   std::initializer_list<LLT> AllS32Vectors =
411     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
412      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
413   std::initializer_list<LLT> AllS64Vectors =
414     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
415 
416   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
417   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
418   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
419   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
420   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
421   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
422   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
423 
424   const LLT CodePtr = FlatPtr;
425 
426   const std::initializer_list<LLT> AddrSpaces64 = {
427     GlobalPtr, ConstantPtr, FlatPtr
428   };
429 
430   const std::initializer_list<LLT> AddrSpaces32 = {
431     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
432   };
433 
434   const std::initializer_list<LLT> FPTypesBase = {
435     S32, S64
436   };
437 
438   const std::initializer_list<LLT> FPTypes16 = {
439     S32, S64, S16
440   };
441 
442   const std::initializer_list<LLT> FPTypesPK16 = {
443     S32, S64, S16, V2S16
444   };
445 
446   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
447 
448   setAction({G_BRCOND, S1}, Legal); // VCC branches
449   setAction({G_BRCOND, S32}, Legal); // SCC branches
450 
451   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
452   // elements for v3s16
453   getActionDefinitionsBuilder(G_PHI)
454     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
455     .legalFor(AllS32Vectors)
456     .legalFor(AllS64Vectors)
457     .legalFor(AddrSpaces64)
458     .legalFor(AddrSpaces32)
459     .legalIf(isPointer(0))
460     .clampScalar(0, S16, S256)
461     .widenScalarToNextPow2(0, 32)
462     .clampMaxNumElements(0, S32, 16)
463     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
464     .scalarize(0);
465 
466   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
467     // Full set of gfx9 features.
468     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
469       .legalFor({S32, S16, V2S16})
470       .clampScalar(0, S16, S32)
471       .clampMaxNumElements(0, S16, 2)
472       .scalarize(0)
473       .widenScalarToNextPow2(0, 32);
474 
475     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
476       .legalFor({S32, S16, V2S16}) // Clamp modifier
477       .minScalarOrElt(0, S16)
478       .clampMaxNumElements(0, S16, 2)
479       .scalarize(0)
480       .widenScalarToNextPow2(0, 32)
481       .lower();
482   } else if (ST.has16BitInsts()) {
483     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
484       .legalFor({S32, S16})
485       .clampScalar(0, S16, S32)
486       .scalarize(0)
487       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
488 
489     // Technically the saturating operations require clamp bit support, but this
490     // was introduced at the same time as 16-bit operations.
491     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
492       .legalFor({S32, S16}) // Clamp modifier
493       .minScalar(0, S16)
494       .scalarize(0)
495       .widenScalarToNextPow2(0, 16)
496       .lower();
497 
498     // We're just lowering this, but it helps get a better result to try to
499     // coerce to the desired type first.
500     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
501       .minScalar(0, S16)
502       .scalarize(0)
503       .lower();
504   } else {
505     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
506       .legalFor({S32})
507       .clampScalar(0, S32, S32)
508       .scalarize(0);
509 
510     if (ST.hasIntClamp()) {
511       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
512         .legalFor({S32}) // Clamp modifier.
513         .scalarize(0)
514         .minScalarOrElt(0, S32)
515         .lower();
516     } else {
517       // Clamp bit support was added in VI, along with 16-bit operations.
518       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
519         .minScalar(0, S32)
520         .scalarize(0)
521         .lower();
522     }
523 
524     // FIXME: DAG expansion gets better results. The widening uses the smaller
525     // range values and goes for the min/max lowering directly.
526     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
527       .minScalar(0, S32)
528       .scalarize(0)
529       .lower();
530   }
531 
532   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
533     .customFor({S32, S64})
534     .clampScalar(0, S32, S64)
535     .widenScalarToNextPow2(0, 32)
536     .scalarize(0);
537 
538   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
539     .legalFor({S32})
540     .clampScalar(0, S32, S32)
541     .scalarize(0);
542 
543   // Report legal for any types we can handle anywhere. For the cases only legal
544   // on the SALU, RegBankSelect will be able to re-legalize.
545   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
546     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
547     .clampScalar(0, S32, S64)
548     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
549     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
550     .widenScalarToNextPow2(0)
551     .scalarize(0);
552 
553   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
554                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
555     .legalFor({{S32, S1}, {S32, S32}})
556     .minScalar(0, S32)
557     // TODO: .scalarize(0)
558     .lower();
559 
560   getActionDefinitionsBuilder(G_BITCAST)
561     // Don't worry about the size constraint.
562     .legalIf(all(isRegisterType(0), isRegisterType(1)))
563     .lower();
564 
565 
566   getActionDefinitionsBuilder(G_CONSTANT)
567     .legalFor({S1, S32, S64, S16, GlobalPtr,
568                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
569     .legalIf(isPointer(0))
570     .clampScalar(0, S32, S64)
571     .widenScalarToNextPow2(0);
572 
573   getActionDefinitionsBuilder(G_FCONSTANT)
574     .legalFor({S32, S64, S16})
575     .clampScalar(0, S16, S64);
576 
577   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
578       .legalIf(isRegisterType(0))
579       // s1 and s16 are special cases because they have legal operations on
580       // them, but don't really occupy registers in the normal way.
581       .legalFor({S1, S16})
582       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
583       .clampScalarOrElt(0, S32, MaxScalar)
584       .widenScalarToNextPow2(0, 32)
585       .clampMaxNumElements(0, S32, 16);
586 
587   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
588 
589   // If the amount is divergent, we have to do a wave reduction to get the
590   // maximum value, so this is expanded during RegBankSelect.
591   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
592     .legalFor({{PrivatePtr, S32}});
593 
594   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
595     .customIf(typeIsNot(0, PrivatePtr));
596 
597   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
598 
599   auto &FPOpActions = getActionDefinitionsBuilder(
600     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
601     .legalFor({S32, S64});
602   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
603     .customFor({S32, S64});
604   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
605     .customFor({S32, S64});
606 
607   if (ST.has16BitInsts()) {
608     if (ST.hasVOP3PInsts())
609       FPOpActions.legalFor({S16, V2S16});
610     else
611       FPOpActions.legalFor({S16});
612 
613     TrigActions.customFor({S16});
614     FDIVActions.customFor({S16});
615   }
616 
617   auto &MinNumMaxNum = getActionDefinitionsBuilder({
618       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
619 
620   if (ST.hasVOP3PInsts()) {
621     MinNumMaxNum.customFor(FPTypesPK16)
622       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
623       .clampMaxNumElements(0, S16, 2)
624       .clampScalar(0, S16, S64)
625       .scalarize(0);
626   } else if (ST.has16BitInsts()) {
627     MinNumMaxNum.customFor(FPTypes16)
628       .clampScalar(0, S16, S64)
629       .scalarize(0);
630   } else {
631     MinNumMaxNum.customFor(FPTypesBase)
632       .clampScalar(0, S32, S64)
633       .scalarize(0);
634   }
635 
636   if (ST.hasVOP3PInsts())
637     FPOpActions.clampMaxNumElements(0, S16, 2);
638 
639   FPOpActions
640     .scalarize(0)
641     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
642 
643   TrigActions
644     .scalarize(0)
645     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
646 
647   FDIVActions
648     .scalarize(0)
649     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
650 
651   getActionDefinitionsBuilder({G_FNEG, G_FABS})
652     .legalFor(FPTypesPK16)
653     .clampMaxNumElements(0, S16, 2)
654     .scalarize(0)
655     .clampScalar(0, S16, S64);
656 
657   if (ST.has16BitInsts()) {
658     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
659       .legalFor({S32, S64, S16})
660       .scalarize(0)
661       .clampScalar(0, S16, S64);
662   } else {
663     getActionDefinitionsBuilder(G_FSQRT)
664       .legalFor({S32, S64})
665       .scalarize(0)
666       .clampScalar(0, S32, S64);
667 
668     if (ST.hasFractBug()) {
669       getActionDefinitionsBuilder(G_FFLOOR)
670         .customFor({S64})
671         .legalFor({S32, S64})
672         .scalarize(0)
673         .clampScalar(0, S32, S64);
674     } else {
675       getActionDefinitionsBuilder(G_FFLOOR)
676         .legalFor({S32, S64})
677         .scalarize(0)
678         .clampScalar(0, S32, S64);
679     }
680   }
681 
682   getActionDefinitionsBuilder(G_FPTRUNC)
683     .legalFor({{S32, S64}, {S16, S32}})
684     .scalarize(0)
685     .lower();
686 
687   getActionDefinitionsBuilder(G_FPEXT)
688     .legalFor({{S64, S32}, {S32, S16}})
689     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
690     .scalarize(0);
691 
692   getActionDefinitionsBuilder(G_FSUB)
693       // Use actual fsub instruction
694       .legalFor({S32})
695       // Must use fadd + fneg
696       .lowerFor({S64, S16, V2S16})
697       .scalarize(0)
698       .clampScalar(0, S32, S64);
699 
700   // Whether this is legal depends on the floating point mode for the function.
701   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
702   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
703     FMad.customFor({S32, S16});
704   else if (ST.hasMadMacF32Insts())
705     FMad.customFor({S32});
706   else if (ST.hasMadF16())
707     FMad.customFor({S16});
708   FMad.scalarize(0)
709       .lower();
710 
711   auto &FRem = getActionDefinitionsBuilder(G_FREM);
712   if (ST.has16BitInsts()) {
713     FRem.customFor({S16, S32, S64});
714   } else {
715     FRem.minScalar(0, S32)
716         .customFor({S32, S64});
717   }
718   FRem.scalarize(0);
719 
720   // TODO: Do we need to clamp maximum bitwidth?
721   getActionDefinitionsBuilder(G_TRUNC)
722     .legalIf(isScalar(0))
723     .legalFor({{V2S16, V2S32}})
724     .clampMaxNumElements(0, S16, 2)
725     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
726     // situations (like an invalid implicit use), we don't want to infinite loop
727     // in the legalizer.
728     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
729     .alwaysLegal();
730 
731   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
732     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
733                {S32, S1}, {S64, S1}, {S16, S1}})
734     .scalarize(0)
735     .clampScalar(0, S32, S64)
736     .widenScalarToNextPow2(1, 32);
737 
738   // TODO: Split s1->s64 during regbankselect for VALU.
739   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
740     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
741     .lowerFor({{S32, S64}})
742     .lowerIf(typeIs(1, S1))
743     .customFor({{S64, S64}});
744   if (ST.has16BitInsts())
745     IToFP.legalFor({{S16, S16}});
746   IToFP.clampScalar(1, S32, S64)
747        .minScalar(0, S32)
748        .scalarize(0)
749        .widenScalarToNextPow2(1);
750 
751   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
752     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
753     .customFor({{S64, S64}})
754     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
755   if (ST.has16BitInsts())
756     FPToI.legalFor({{S16, S16}});
757   else
758     FPToI.minScalar(1, S32);
759 
760   FPToI.minScalar(0, S32)
761        .scalarize(0)
762        .lower();
763 
764   // Lower roundeven into G_FRINT
765   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
766     .scalarize(0)
767     .lower();
768 
769   if (ST.has16BitInsts()) {
770     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
771       .legalFor({S16, S32, S64})
772       .clampScalar(0, S16, S64)
773       .scalarize(0);
774   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
775     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
776       .legalFor({S32, S64})
777       .clampScalar(0, S32, S64)
778       .scalarize(0);
779   } else {
780     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
781       .legalFor({S32})
782       .customFor({S64})
783       .clampScalar(0, S32, S64)
784       .scalarize(0);
785   }
786 
787   getActionDefinitionsBuilder(G_PTR_ADD)
788     .legalIf(all(isPointer(0), sameSize(0, 1)))
789     .scalarize(0)
790     .scalarSameSizeAs(1, 0);
791 
792   getActionDefinitionsBuilder(G_PTRMASK)
793     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
794     .scalarSameSizeAs(1, 0)
795     .scalarize(0);
796 
797   auto &CmpBuilder =
798     getActionDefinitionsBuilder(G_ICMP)
799     // The compare output type differs based on the register bank of the output,
800     // so make both s1 and s32 legal.
801     //
802     // Scalar compares producing output in scc will be promoted to s32, as that
803     // is the allocatable register type that will be needed for the copy from
804     // scc. This will be promoted during RegBankSelect, and we assume something
805     // before that won't try to use s32 result types.
806     //
807     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
808     // bank.
809     .legalForCartesianProduct(
810       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
811     .legalForCartesianProduct(
812       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
813   if (ST.has16BitInsts()) {
814     CmpBuilder.legalFor({{S1, S16}});
815   }
816 
817   CmpBuilder
818     .widenScalarToNextPow2(1)
819     .clampScalar(1, S32, S64)
820     .scalarize(0)
821     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
822 
823   getActionDefinitionsBuilder(G_FCMP)
824     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
825     .widenScalarToNextPow2(1)
826     .clampScalar(1, S32, S64)
827     .scalarize(0);
828 
829   // FIXME: fpow has a selection pattern that should move to custom lowering.
830   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
831   if (ST.has16BitInsts())
832     Exp2Ops.legalFor({S32, S16});
833   else
834     Exp2Ops.legalFor({S32});
835   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
836   Exp2Ops.scalarize(0);
837 
838   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
839   if (ST.has16BitInsts())
840     ExpOps.customFor({{S32}, {S16}});
841   else
842     ExpOps.customFor({S32});
843   ExpOps.clampScalar(0, MinScalarFPTy, S32)
844         .scalarize(0);
845 
846   getActionDefinitionsBuilder(G_FPOWI)
847     .clampScalar(0, MinScalarFPTy, S32)
848     .lower();
849 
850   // The 64-bit versions produce 32-bit results, but only on the SALU.
851   getActionDefinitionsBuilder(G_CTPOP)
852     .legalFor({{S32, S32}, {S32, S64}})
853     .clampScalar(0, S32, S32)
854     .clampScalar(1, S32, S64)
855     .scalarize(0)
856     .widenScalarToNextPow2(0, 32)
857     .widenScalarToNextPow2(1, 32);
858 
859   // The hardware instructions return a different result on 0 than the generic
860   // instructions expect. The hardware produces -1, but these produce the
861   // bitwidth.
862   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
863     .scalarize(0)
864     .clampScalar(0, S32, S32)
865     .clampScalar(1, S32, S64)
866     .widenScalarToNextPow2(0, 32)
867     .widenScalarToNextPow2(1, 32)
868     .lower();
869 
870   // The 64-bit versions produce 32-bit results, but only on the SALU.
871   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
872     .legalFor({{S32, S32}, {S32, S64}})
873     .clampScalar(0, S32, S32)
874     .clampScalar(1, S32, S64)
875     .scalarize(0)
876     .widenScalarToNextPow2(0, 32)
877     .widenScalarToNextPow2(1, 32);
878 
879   getActionDefinitionsBuilder(G_BITREVERSE)
880     .legalFor({S32})
881     .clampScalar(0, S32, S32)
882     .scalarize(0);
883 
884   if (ST.has16BitInsts()) {
885     getActionDefinitionsBuilder(G_BSWAP)
886       .legalFor({S16, S32, V2S16})
887       .clampMaxNumElements(0, S16, 2)
888       // FIXME: Fixing non-power-of-2 before clamp is workaround for
889       // narrowScalar limitation.
890       .widenScalarToNextPow2(0)
891       .clampScalar(0, S16, S32)
892       .scalarize(0);
893 
894     if (ST.hasVOP3PInsts()) {
895       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
896         .legalFor({S32, S16, V2S16})
897         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
898         .clampMaxNumElements(0, S16, 2)
899         .minScalar(0, S16)
900         .widenScalarToNextPow2(0)
901         .scalarize(0)
902         .lower();
903     } else {
904       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
905         .legalFor({S32, S16})
906         .widenScalarToNextPow2(0)
907         .minScalar(0, S16)
908         .scalarize(0)
909         .lower();
910     }
911   } else {
912     // TODO: Should have same legality without v_perm_b32
913     getActionDefinitionsBuilder(G_BSWAP)
914       .legalFor({S32})
915       .lowerIf(scalarNarrowerThan(0, 32))
916       // FIXME: Fixing non-power-of-2 before clamp is workaround for
917       // narrowScalar limitation.
918       .widenScalarToNextPow2(0)
919       .maxScalar(0, S32)
920       .scalarize(0)
921       .lower();
922 
923     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
924       .legalFor({S32})
925       .minScalar(0, S32)
926       .widenScalarToNextPow2(0)
927       .scalarize(0)
928       .lower();
929   }
930 
931   getActionDefinitionsBuilder(G_INTTOPTR)
932     // List the common cases
933     .legalForCartesianProduct(AddrSpaces64, {S64})
934     .legalForCartesianProduct(AddrSpaces32, {S32})
935     .scalarize(0)
936     // Accept any address space as long as the size matches
937     .legalIf(sameSize(0, 1))
938     .widenScalarIf(smallerThan(1, 0),
939       [](const LegalityQuery &Query) {
940         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
941       })
942     .narrowScalarIf(largerThan(1, 0),
943       [](const LegalityQuery &Query) {
944         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
945       });
946 
947   getActionDefinitionsBuilder(G_PTRTOINT)
948     // List the common cases
949     .legalForCartesianProduct(AddrSpaces64, {S64})
950     .legalForCartesianProduct(AddrSpaces32, {S32})
951     .scalarize(0)
952     // Accept any address space as long as the size matches
953     .legalIf(sameSize(0, 1))
954     .widenScalarIf(smallerThan(0, 1),
955       [](const LegalityQuery &Query) {
956         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
957       })
958     .narrowScalarIf(
959       largerThan(0, 1),
960       [](const LegalityQuery &Query) {
961         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
962       });
963 
964   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
965     .scalarize(0)
966     .custom();
967 
968   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
969                                     bool IsLoad) -> bool {
970     const LLT DstTy = Query.Types[0];
971 
972     // Split vector extloads.
973     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
974     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
975 
976     if (MemSize < DstTy.getSizeInBits())
977       MemSize = std::max(MemSize, AlignBits);
978 
979     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
980       return true;
981 
982     const LLT PtrTy = Query.Types[1];
983     unsigned AS = PtrTy.getAddressSpace();
984     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
985       return true;
986 
987     // Catch weird sized loads that don't evenly divide into the access sizes
988     // TODO: May be able to widen depending on alignment etc.
989     unsigned NumRegs = (MemSize + 31) / 32;
990     if (NumRegs == 3) {
991       if (!ST.hasDwordx3LoadStores())
992         return true;
993     } else {
994       // If the alignment allows, these should have been widened.
995       if (!isPowerOf2_32(NumRegs))
996         return true;
997     }
998 
999     if (AlignBits < MemSize) {
1000       const SITargetLowering *TLI = ST.getTargetLowering();
1001       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1002                                                       Align(AlignBits / 8));
1003     }
1004 
1005     return false;
1006   };
1007 
1008   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
1009                                          unsigned Opc) -> bool {
1010     unsigned Size = Query.Types[0].getSizeInBits();
1011     if (isPowerOf2_32(Size))
1012       return false;
1013 
1014     if (Size == 96 && ST.hasDwordx3LoadStores())
1015       return false;
1016 
1017     unsigned AddrSpace = Query.Types[1].getAddressSpace();
1018     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
1019       return false;
1020 
1021     unsigned Align = Query.MMODescrs[0].AlignInBits;
1022     unsigned RoundedSize = NextPowerOf2(Size);
1023     return (Align >= RoundedSize);
1024   };
1025 
1026   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
1027   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
1028   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
1029 
1030   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1031   // LDS
1032   // TODO: Unsupported flat for SI.
1033 
1034   for (unsigned Op : {G_LOAD, G_STORE}) {
1035     const bool IsStore = Op == G_STORE;
1036 
1037     auto &Actions = getActionDefinitionsBuilder(Op);
1038     // Explicitly list some common cases.
1039     // TODO: Does this help compile time at all?
1040     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1041                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1042                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1043                                       {S64, GlobalPtr, 64, GlobalAlign32},
1044                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1045                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1046                                       {S32, GlobalPtr, 8, GlobalAlign8},
1047                                       {S32, GlobalPtr, 16, GlobalAlign16},
1048 
1049                                       {S32, LocalPtr, 32, 32},
1050                                       {S64, LocalPtr, 64, 32},
1051                                       {V2S32, LocalPtr, 64, 32},
1052                                       {S32, LocalPtr, 8, 8},
1053                                       {S32, LocalPtr, 16, 16},
1054                                       {V2S16, LocalPtr, 32, 32},
1055 
1056                                       {S32, PrivatePtr, 32, 32},
1057                                       {S32, PrivatePtr, 8, 8},
1058                                       {S32, PrivatePtr, 16, 16},
1059                                       {V2S16, PrivatePtr, 32, 32},
1060 
1061                                       {S32, ConstantPtr, 32, GlobalAlign32},
1062                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1063                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1064                                       {S64, ConstantPtr, 64, GlobalAlign32},
1065                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1066     Actions.legalIf(
1067       [=](const LegalityQuery &Query) -> bool {
1068         return isLoadStoreLegal(ST, Query, Op);
1069       });
1070 
1071     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1072     // 64-bits.
1073     //
1074     // TODO: Should generalize bitcast action into coerce, which will also cover
1075     // inserting addrspacecasts.
1076     Actions.customIf(typeIs(1, Constant32Ptr));
1077 
1078     // Turn any illegal element vectors into something easier to deal
1079     // with. These will ultimately produce 32-bit scalar shifts to extract the
1080     // parts anyway.
1081     //
1082     // For odd 16-bit element vectors, prefer to split those into pieces with
1083     // 16-bit vector parts.
1084     Actions.bitcastIf(
1085       [=](const LegalityQuery &Query) -> bool {
1086         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1087                                           Query.MMODescrs[0].SizeInBits);
1088       }, bitcastToRegisterType(0));
1089 
1090     Actions
1091         .customIf(typeIs(1, Constant32Ptr))
1092         // Widen suitably aligned loads by loading extra elements.
1093         .moreElementsIf([=](const LegalityQuery &Query) {
1094             const LLT Ty = Query.Types[0];
1095             return Op == G_LOAD && Ty.isVector() &&
1096                    shouldWidenLoadResult(Query, Op);
1097           }, moreElementsToNextPow2(0))
1098         .widenScalarIf([=](const LegalityQuery &Query) {
1099             const LLT Ty = Query.Types[0];
1100             return Op == G_LOAD && !Ty.isVector() &&
1101                    shouldWidenLoadResult(Query, Op);
1102           }, widenScalarOrEltToNextPow2(0))
1103         .narrowScalarIf(
1104             [=](const LegalityQuery &Query) -> bool {
1105               return !Query.Types[0].isVector() &&
1106                      needToSplitMemOp(Query, Op == G_LOAD);
1107             },
1108             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1109               const LLT DstTy = Query.Types[0];
1110               const LLT PtrTy = Query.Types[1];
1111 
1112               const unsigned DstSize = DstTy.getSizeInBits();
1113               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1114 
1115               // Split extloads.
1116               if (DstSize > MemSize)
1117                 return std::make_pair(0, LLT::scalar(MemSize));
1118 
1119               if (!isPowerOf2_32(DstSize)) {
1120                 // We're probably decomposing an odd sized store. Try to split
1121                 // to the widest type. TODO: Account for alignment. As-is it
1122                 // should be OK, since the new parts will be further legalized.
1123                 unsigned FloorSize = PowerOf2Floor(DstSize);
1124                 return std::make_pair(0, LLT::scalar(FloorSize));
1125               }
1126 
1127               if (DstSize > 32 && (DstSize % 32 != 0)) {
1128                 // FIXME: Need a way to specify non-extload of larger size if
1129                 // suitably aligned.
1130                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1131               }
1132 
1133               unsigned MaxSize = maxSizeForAddrSpace(ST,
1134                                                      PtrTy.getAddressSpace(),
1135                                                      Op == G_LOAD);
1136               if (MemSize > MaxSize)
1137                 return std::make_pair(0, LLT::scalar(MaxSize));
1138 
1139               unsigned Align = Query.MMODescrs[0].AlignInBits;
1140               return std::make_pair(0, LLT::scalar(Align));
1141             })
1142         .fewerElementsIf(
1143             [=](const LegalityQuery &Query) -> bool {
1144               return Query.Types[0].isVector() &&
1145                      needToSplitMemOp(Query, Op == G_LOAD);
1146             },
1147             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1148               const LLT DstTy = Query.Types[0];
1149               const LLT PtrTy = Query.Types[1];
1150 
1151               LLT EltTy = DstTy.getElementType();
1152               unsigned MaxSize = maxSizeForAddrSpace(ST,
1153                                                      PtrTy.getAddressSpace(),
1154                                                      Op == G_LOAD);
1155 
1156               // FIXME: Handle widened to power of 2 results better. This ends
1157               // up scalarizing.
1158               // FIXME: 3 element stores scalarized on SI
1159 
1160               // Split if it's too large for the address space.
1161               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1162                 unsigned NumElts = DstTy.getNumElements();
1163                 unsigned EltSize = EltTy.getSizeInBits();
1164 
1165                 if (MaxSize % EltSize == 0) {
1166                   return std::make_pair(
1167                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1168                 }
1169 
1170                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1171 
1172                 // FIXME: Refine when odd breakdowns handled
1173                 // The scalars will need to be re-legalized.
1174                 if (NumPieces == 1 || NumPieces >= NumElts ||
1175                     NumElts % NumPieces != 0)
1176                   return std::make_pair(0, EltTy);
1177 
1178                 return std::make_pair(0,
1179                                       LLT::vector(NumElts / NumPieces, EltTy));
1180               }
1181 
1182               // FIXME: We could probably handle weird extending loads better.
1183               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1184               if (DstTy.getSizeInBits() > MemSize)
1185                 return std::make_pair(0, EltTy);
1186 
1187               unsigned EltSize = EltTy.getSizeInBits();
1188               unsigned DstSize = DstTy.getSizeInBits();
1189               if (!isPowerOf2_32(DstSize)) {
1190                 // We're probably decomposing an odd sized store. Try to split
1191                 // to the widest type. TODO: Account for alignment. As-is it
1192                 // should be OK, since the new parts will be further legalized.
1193                 unsigned FloorSize = PowerOf2Floor(DstSize);
1194                 return std::make_pair(
1195                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1196               }
1197 
1198               // Need to split because of alignment.
1199               unsigned Align = Query.MMODescrs[0].AlignInBits;
1200               if (EltSize > Align &&
1201                   (EltSize / Align < DstTy.getNumElements())) {
1202                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1203               }
1204 
1205               // May need relegalization for the scalars.
1206               return std::make_pair(0, EltTy);
1207             })
1208         .minScalar(0, S32);
1209 
1210     if (IsStore)
1211       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1212 
1213     // TODO: Need a bitcast lower option?
1214     Actions
1215         .widenScalarToNextPow2(0)
1216         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1217   }
1218 
1219   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1220                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1221                                                   {S32, GlobalPtr, 16, 2 * 8},
1222                                                   {S32, LocalPtr, 8, 8},
1223                                                   {S32, LocalPtr, 16, 16},
1224                                                   {S32, PrivatePtr, 8, 8},
1225                                                   {S32, PrivatePtr, 16, 16},
1226                                                   {S32, ConstantPtr, 8, 8},
1227                                                   {S32, ConstantPtr, 16, 2 * 8}});
1228   if (ST.hasFlatAddressSpace()) {
1229     ExtLoads.legalForTypesWithMemDesc(
1230         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1231   }
1232 
1233   ExtLoads.clampScalar(0, S32, S32)
1234           .widenScalarToNextPow2(0)
1235           .unsupportedIfMemSizeNotPow2()
1236           .lower();
1237 
1238   auto &Atomics = getActionDefinitionsBuilder(
1239     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1240      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1241      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1242      G_ATOMICRMW_UMIN})
1243     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1244                {S64, GlobalPtr}, {S64, LocalPtr},
1245                {S32, RegionPtr}, {S64, RegionPtr}});
1246   if (ST.hasFlatAddressSpace()) {
1247     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1248   }
1249 
1250   if (ST.hasLDSFPAtomics()) {
1251     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1252       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1253   }
1254 
1255   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1256   // demarshalling
1257   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1258     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1259                 {S32, FlatPtr}, {S64, FlatPtr}})
1260     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1261                {S32, RegionPtr}, {S64, RegionPtr}});
1262   // TODO: Pointer types, any 32-bit or 64-bit vector
1263 
1264   // Condition should be s32 for scalar, s1 for vector.
1265   getActionDefinitionsBuilder(G_SELECT)
1266     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1267           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1268           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1269     .clampScalar(0, S16, S64)
1270     .scalarize(1)
1271     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1272     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1273     .clampMaxNumElements(0, S32, 2)
1274     .clampMaxNumElements(0, LocalPtr, 2)
1275     .clampMaxNumElements(0, PrivatePtr, 2)
1276     .scalarize(0)
1277     .widenScalarToNextPow2(0)
1278     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1279 
1280   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1281   // be more flexible with the shift amount type.
1282   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1283     .legalFor({{S32, S32}, {S64, S32}});
1284   if (ST.has16BitInsts()) {
1285     if (ST.hasVOP3PInsts()) {
1286       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1287             .clampMaxNumElements(0, S16, 2);
1288     } else
1289       Shifts.legalFor({{S16, S16}});
1290 
1291     // TODO: Support 16-bit shift amounts for all types
1292     Shifts.widenScalarIf(
1293       [=](const LegalityQuery &Query) {
1294         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1295         // 32-bit amount.
1296         const LLT ValTy = Query.Types[0];
1297         const LLT AmountTy = Query.Types[1];
1298         return ValTy.getSizeInBits() <= 16 &&
1299                AmountTy.getSizeInBits() < 16;
1300       }, changeTo(1, S16));
1301     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1302     Shifts.clampScalar(1, S32, S32);
1303     Shifts.clampScalar(0, S16, S64);
1304     Shifts.widenScalarToNextPow2(0, 16);
1305 
1306     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1307       .minScalar(0, S16)
1308       .scalarize(0)
1309       .lower();
1310   } else {
1311     // Make sure we legalize the shift amount type first, as the general
1312     // expansion for the shifted type will produce much worse code if it hasn't
1313     // been truncated already.
1314     Shifts.clampScalar(1, S32, S32);
1315     Shifts.clampScalar(0, S32, S64);
1316     Shifts.widenScalarToNextPow2(0, 32);
1317 
1318     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1319       .minScalar(0, S32)
1320       .scalarize(0)
1321       .lower();
1322   }
1323   Shifts.scalarize(0);
1324 
1325   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1326     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1327     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1328     unsigned IdxTypeIdx = 2;
1329 
1330     getActionDefinitionsBuilder(Op)
1331       .customIf([=](const LegalityQuery &Query) {
1332           const LLT EltTy = Query.Types[EltTypeIdx];
1333           const LLT VecTy = Query.Types[VecTypeIdx];
1334           const LLT IdxTy = Query.Types[IdxTypeIdx];
1335           const unsigned EltSize = EltTy.getSizeInBits();
1336           return (EltSize == 32 || EltSize == 64) &&
1337                   VecTy.getSizeInBits() % 32 == 0 &&
1338                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1339                   IdxTy.getSizeInBits() == 32;
1340         })
1341       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1342                  bitcastToVectorElement32(VecTypeIdx))
1343       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1344       .bitcastIf(
1345         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1346         [=](const LegalityQuery &Query) {
1347           // For > 64-bit element types, try to turn this into a 64-bit
1348           // element vector since we may be able to do better indexing
1349           // if this is scalar. If not, fall back to 32.
1350           const LLT EltTy = Query.Types[EltTypeIdx];
1351           const LLT VecTy = Query.Types[VecTypeIdx];
1352           const unsigned DstEltSize = EltTy.getSizeInBits();
1353           const unsigned VecSize = VecTy.getSizeInBits();
1354 
1355           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1356           return std::make_pair(
1357             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1358         })
1359       .clampScalar(EltTypeIdx, S32, S64)
1360       .clampScalar(VecTypeIdx, S32, S64)
1361       .clampScalar(IdxTypeIdx, S32, S32)
1362       .clampMaxNumElements(VecTypeIdx, S32, 32)
1363       // TODO: Clamp elements for 64-bit vectors?
1364       // It should only be necessary with variable indexes.
1365       // As a last resort, lower to the stack
1366       .lower();
1367   }
1368 
1369   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1370     .unsupportedIf([=](const LegalityQuery &Query) {
1371         const LLT &EltTy = Query.Types[1].getElementType();
1372         return Query.Types[0] != EltTy;
1373       });
1374 
1375   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1376     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1377     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1378 
1379     // FIXME: Doesn't handle extract of illegal sizes.
1380     getActionDefinitionsBuilder(Op)
1381       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1382       // FIXME: Multiples of 16 should not be legal.
1383       .legalIf([=](const LegalityQuery &Query) {
1384           const LLT BigTy = Query.Types[BigTyIdx];
1385           const LLT LitTy = Query.Types[LitTyIdx];
1386           return (BigTy.getSizeInBits() % 32 == 0) &&
1387                  (LitTy.getSizeInBits() % 16 == 0);
1388         })
1389       .widenScalarIf(
1390         [=](const LegalityQuery &Query) {
1391           const LLT BigTy = Query.Types[BigTyIdx];
1392           return (BigTy.getScalarSizeInBits() < 16);
1393         },
1394         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1395       .widenScalarIf(
1396         [=](const LegalityQuery &Query) {
1397           const LLT LitTy = Query.Types[LitTyIdx];
1398           return (LitTy.getScalarSizeInBits() < 16);
1399         },
1400         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1401       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1402       .widenScalarToNextPow2(BigTyIdx, 32);
1403 
1404   }
1405 
1406   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1407     .legalForCartesianProduct(AllS32Vectors, {S32})
1408     .legalForCartesianProduct(AllS64Vectors, {S64})
1409     .clampNumElements(0, V16S32, V32S32)
1410     .clampNumElements(0, V2S64, V16S64)
1411     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1412 
1413   if (ST.hasScalarPackInsts()) {
1414     BuildVector
1415       // FIXME: Should probably widen s1 vectors straight to s32
1416       .minScalarOrElt(0, S16)
1417       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1418       .minScalar(1, S32);
1419 
1420     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1421       .legalFor({V2S16, S32})
1422       .lower();
1423     BuildVector.minScalarOrElt(0, S32);
1424   } else {
1425     BuildVector.customFor({V2S16, S16});
1426     BuildVector.minScalarOrElt(0, S32);
1427 
1428     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1429       .customFor({V2S16, S32})
1430       .lower();
1431   }
1432 
1433   BuildVector.legalIf(isRegisterType(0));
1434 
1435   // FIXME: Clamp maximum size
1436   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1437     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1438     .clampMaxNumElements(0, S32, 32)
1439     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1440     .clampMaxNumElements(0, S16, 64);
1441 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1444   if (ST.hasVOP3PInsts()) {
1445     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1446       .customFor({V2S16, V2S16})
1447       .lower();
1448   } else
1449     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1450 
1451   // Merge/Unmerge
1452   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1453     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1454     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1455 
1456     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1457       const LLT Ty = Query.Types[TypeIdx];
1458       if (Ty.isVector()) {
1459         const LLT &EltTy = Ty.getElementType();
1460         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1461           return true;
1462         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1463           return true;
1464       }
1465       return false;
1466     };
1467 
1468     auto &Builder = getActionDefinitionsBuilder(Op)
1469       .lowerFor({{S16, V2S16}})
1470       .lowerIf([=](const LegalityQuery &Query) {
1471           const LLT BigTy = Query.Types[BigTyIdx];
1472           return BigTy.getSizeInBits() == 32;
1473         })
1474       // Try to widen to s16 first for small types.
1475       // TODO: Only do this on targets with legal s16 shifts
1476       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1477       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1478       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1479       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1480                            elementTypeIs(1, S16)),
1481                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1485       .clampScalar(LitTyIdx, S32, S512)
1486       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1487       // Break up vectors with weird elements into scalars
1488       .fewerElementsIf(
1489         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1490         scalarize(0))
1491       .fewerElementsIf(
1492         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1493         scalarize(1))
1494       .clampScalar(BigTyIdx, S32, MaxScalar);
1495 
1496     if (Op == G_MERGE_VALUES) {
1497       Builder.widenScalarIf(
1498         // TODO: Use 16-bit shifts if legal for 8-bit values?
1499         [=](const LegalityQuery &Query) {
1500           const LLT Ty = Query.Types[LitTyIdx];
1501           return Ty.getSizeInBits() < 32;
1502         },
1503         changeTo(LitTyIdx, S32));
1504     }
1505 
1506     Builder.widenScalarIf(
1507       [=](const LegalityQuery &Query) {
1508         const LLT Ty = Query.Types[BigTyIdx];
1509         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1510           Ty.getSizeInBits() % 16 != 0;
1511       },
1512       [=](const LegalityQuery &Query) {
1513         // Pick the next power of 2, or a multiple of 64 over 128.
1514         // Whichever is smaller.
1515         const LLT &Ty = Query.Types[BigTyIdx];
1516         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1517         if (NewSizeInBits >= 256) {
1518           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1519           if (RoundedTo < NewSizeInBits)
1520             NewSizeInBits = RoundedTo;
1521         }
1522         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1523       })
1524       .legalIf([=](const LegalityQuery &Query) {
1525           const LLT &BigTy = Query.Types[BigTyIdx];
1526           const LLT &LitTy = Query.Types[LitTyIdx];
1527 
1528           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1529             return false;
1530           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1531             return false;
1532 
1533           return BigTy.getSizeInBits() % 16 == 0 &&
1534                  LitTy.getSizeInBits() % 16 == 0 &&
1535                  BigTy.getSizeInBits() <= MaxRegisterSize;
1536         })
1537       // Any vectors left are the wrong size. Scalarize them.
1538       .scalarize(0)
1539       .scalarize(1);
1540   }
1541 
1542   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1543   // RegBankSelect.
1544   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1545     .legalFor({{S32}, {S64}});
1546 
1547   if (ST.hasVOP3PInsts()) {
1548     SextInReg.lowerFor({{V2S16}})
1549       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1550       // get more vector shift opportunities, since we'll get those when
1551       // expanded.
1552       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1553   } else if (ST.has16BitInsts()) {
1554     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1555   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1558     SextInReg.lowerFor({{S32}, {S64}});
1559   }
1560 
1561   SextInReg
1562     .scalarize(0)
1563     .clampScalar(0, S32, S64)
1564     .lower();
1565 
1566   getActionDefinitionsBuilder(G_FSHR)
1567     .legalFor({{S32, S32}})
1568     .scalarize(0)
1569     .lower();
1570 
1571   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1572     .legalFor({S64});
1573 
1574   getActionDefinitionsBuilder(G_FENCE)
1575     .alwaysLegal();
1576 
1577   getActionDefinitionsBuilder({
1578       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1579       G_FCOPYSIGN,
1580 
1581       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1582       G_ATOMICRMW_NAND,
1583       G_ATOMICRMW_FSUB,
1584       G_READ_REGISTER,
1585       G_WRITE_REGISTER,
1586 
1587       G_SADDO, G_SSUBO,
1588 
1589        // TODO: Implement
1590       G_FMINIMUM, G_FMAXIMUM,
1591       G_FSHL
1592     }).lower();
1593 
1594   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1595         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1596         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1597     .unsupported();
1598 
1599   computeTables();
1600   verify(*ST.getInstrInfo());
1601 }
1602 
1603 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1604                                          MachineInstr &MI) const {
1605   MachineIRBuilder &B = Helper.MIRBuilder;
1606   MachineRegisterInfo &MRI = *B.getMRI();
1607 
1608   switch (MI.getOpcode()) {
1609   case TargetOpcode::G_ADDRSPACE_CAST:
1610     return legalizeAddrSpaceCast(MI, MRI, B);
1611   case TargetOpcode::G_FRINT:
1612     return legalizeFrint(MI, MRI, B);
1613   case TargetOpcode::G_FCEIL:
1614     return legalizeFceil(MI, MRI, B);
1615   case TargetOpcode::G_FREM:
1616     return legalizeFrem(MI, MRI, B);
1617   case TargetOpcode::G_INTRINSIC_TRUNC:
1618     return legalizeIntrinsicTrunc(MI, MRI, B);
1619   case TargetOpcode::G_SITOFP:
1620     return legalizeITOFP(MI, MRI, B, true);
1621   case TargetOpcode::G_UITOFP:
1622     return legalizeITOFP(MI, MRI, B, false);
1623   case TargetOpcode::G_FPTOSI:
1624     return legalizeFPTOI(MI, MRI, B, true);
1625   case TargetOpcode::G_FPTOUI:
1626     return legalizeFPTOI(MI, MRI, B, false);
1627   case TargetOpcode::G_FMINNUM:
1628   case TargetOpcode::G_FMAXNUM:
1629   case TargetOpcode::G_FMINNUM_IEEE:
1630   case TargetOpcode::G_FMAXNUM_IEEE:
1631     return legalizeMinNumMaxNum(Helper, MI);
1632   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1633     return legalizeExtractVectorElt(MI, MRI, B);
1634   case TargetOpcode::G_INSERT_VECTOR_ELT:
1635     return legalizeInsertVectorElt(MI, MRI, B);
1636   case TargetOpcode::G_SHUFFLE_VECTOR:
1637     return legalizeShuffleVector(MI, MRI, B);
1638   case TargetOpcode::G_FSIN:
1639   case TargetOpcode::G_FCOS:
1640     return legalizeSinCos(MI, MRI, B);
1641   case TargetOpcode::G_GLOBAL_VALUE:
1642     return legalizeGlobalValue(MI, MRI, B);
1643   case TargetOpcode::G_LOAD:
1644     return legalizeLoad(Helper, MI);
1645   case TargetOpcode::G_FMAD:
1646     return legalizeFMad(MI, MRI, B);
1647   case TargetOpcode::G_FDIV:
1648     return legalizeFDIV(MI, MRI, B);
1649   case TargetOpcode::G_UDIV:
1650   case TargetOpcode::G_UREM:
1651     return legalizeUDIV_UREM(MI, MRI, B);
1652   case TargetOpcode::G_SDIV:
1653   case TargetOpcode::G_SREM:
1654     return legalizeSDIV_SREM(MI, MRI, B);
1655   case TargetOpcode::G_ATOMIC_CMPXCHG:
1656     return legalizeAtomicCmpXChg(MI, MRI, B);
1657   case TargetOpcode::G_FLOG:
1658     return legalizeFlog(MI, B, numbers::ln2f);
1659   case TargetOpcode::G_FLOG10:
1660     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1661   case TargetOpcode::G_FEXP:
1662     return legalizeFExp(MI, B);
1663   case TargetOpcode::G_FPOW:
1664     return legalizeFPow(MI, B);
1665   case TargetOpcode::G_FFLOOR:
1666     return legalizeFFloor(MI, MRI, B);
1667   case TargetOpcode::G_BUILD_VECTOR:
1668     return legalizeBuildVector(MI, MRI, B);
1669   default:
1670     return false;
1671   }
1672 
1673   llvm_unreachable("expected switch to return");
1674 }
1675 
1676 Register AMDGPULegalizerInfo::getSegmentAperture(
1677   unsigned AS,
1678   MachineRegisterInfo &MRI,
1679   MachineIRBuilder &B) const {
1680   MachineFunction &MF = B.getMF();
1681   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1682   const LLT S32 = LLT::scalar(32);
1683 
1684   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1685 
1686   if (ST.hasApertureRegs()) {
1687     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1688     // getreg.
1689     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1690         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1691         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1692     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1693         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1694         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1695     unsigned Encoding =
1696         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1697         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1698         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1699 
1700     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1701 
1702     B.buildInstr(AMDGPU::S_GETREG_B32)
1703       .addDef(GetReg)
1704       .addImm(Encoding);
1705     MRI.setType(GetReg, S32);
1706 
1707     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1708     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1709   }
1710 
1711   Register QueuePtr = MRI.createGenericVirtualRegister(
1712     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1713 
1714   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1715     return Register();
1716 
1717   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1718   // private_segment_aperture_base_hi.
1719   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1720 
1721   // TODO: can we be smarter about machine pointer info?
1722   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1723   MachineMemOperand *MMO = MF.getMachineMemOperand(
1724       PtrInfo,
1725       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1726           MachineMemOperand::MOInvariant,
1727       4, commonAlignment(Align(64), StructOffset));
1728 
1729   Register LoadAddr;
1730 
1731   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1732   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1733 }
1734 
1735 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1736   MachineInstr &MI, MachineRegisterInfo &MRI,
1737   MachineIRBuilder &B) const {
1738   MachineFunction &MF = B.getMF();
1739 
1740   const LLT S32 = LLT::scalar(32);
1741   Register Dst = MI.getOperand(0).getReg();
1742   Register Src = MI.getOperand(1).getReg();
1743 
1744   LLT DstTy = MRI.getType(Dst);
1745   LLT SrcTy = MRI.getType(Src);
1746   unsigned DestAS = DstTy.getAddressSpace();
1747   unsigned SrcAS = SrcTy.getAddressSpace();
1748 
1749   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1750   // vector element.
1751   assert(!DstTy.isVector());
1752 
1753   const AMDGPUTargetMachine &TM
1754     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1755 
1756   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1757     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1758     return true;
1759   }
1760 
1761   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1762     // Truncate.
1763     B.buildExtract(Dst, Src, 0);
1764     MI.eraseFromParent();
1765     return true;
1766   }
1767 
1768   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1769     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1770     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1771 
1772     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1773     // another. Merge operands are required to be the same type, but creating an
1774     // extra ptrtoint would be kind of pointless.
1775     auto HighAddr = B.buildConstant(
1776       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1777     B.buildMerge(Dst, {Src, HighAddr});
1778     MI.eraseFromParent();
1779     return true;
1780   }
1781 
1782   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1783     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1784            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1785     unsigned NullVal = TM.getNullPointerValue(DestAS);
1786 
1787     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1788     auto FlatNull = B.buildConstant(SrcTy, 0);
1789 
1790     // Extract low 32-bits of the pointer.
1791     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1792 
1793     auto CmpRes =
1794         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1795     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1796 
1797     MI.eraseFromParent();
1798     return true;
1799   }
1800 
1801   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1802     return false;
1803 
1804   if (!ST.hasFlatAddressSpace())
1805     return false;
1806 
1807   auto SegmentNull =
1808       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1809   auto FlatNull =
1810       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1811 
1812   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1813   if (!ApertureReg.isValid())
1814     return false;
1815 
1816   auto CmpRes =
1817       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1818 
1819   // Coerce the type of the low half of the result so we can use merge_values.
1820   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1821 
1822   // TODO: Should we allow mismatched types but matching sizes in merges to
1823   // avoid the ptrtoint?
1824   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1825   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1826 
1827   MI.eraseFromParent();
1828   return true;
1829 }
1830 
1831 bool AMDGPULegalizerInfo::legalizeFrint(
1832   MachineInstr &MI, MachineRegisterInfo &MRI,
1833   MachineIRBuilder &B) const {
1834   Register Src = MI.getOperand(1).getReg();
1835   LLT Ty = MRI.getType(Src);
1836   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1837 
1838   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1839   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1840 
1841   auto C1 = B.buildFConstant(Ty, C1Val);
1842   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1843 
1844   // TODO: Should this propagate fast-math-flags?
1845   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1846   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1847 
1848   auto C2 = B.buildFConstant(Ty, C2Val);
1849   auto Fabs = B.buildFAbs(Ty, Src);
1850 
1851   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1852   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1853   MI.eraseFromParent();
1854   return true;
1855 }
1856 
1857 bool AMDGPULegalizerInfo::legalizeFceil(
1858   MachineInstr &MI, MachineRegisterInfo &MRI,
1859   MachineIRBuilder &B) const {
1860 
1861   const LLT S1 = LLT::scalar(1);
1862   const LLT S64 = LLT::scalar(64);
1863 
1864   Register Src = MI.getOperand(1).getReg();
1865   assert(MRI.getType(Src) == S64);
1866 
1867   // result = trunc(src)
1868   // if (src > 0.0 && src != result)
1869   //   result += 1.0
1870 
1871   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1872 
1873   const auto Zero = B.buildFConstant(S64, 0.0);
1874   const auto One = B.buildFConstant(S64, 1.0);
1875   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1876   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1877   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1878   auto Add = B.buildSelect(S64, And, One, Zero);
1879 
1880   // TODO: Should this propagate fast-math-flags?
1881   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1882   return true;
1883 }
1884 
1885 bool AMDGPULegalizerInfo::legalizeFrem(
1886   MachineInstr &MI, MachineRegisterInfo &MRI,
1887   MachineIRBuilder &B) const {
1888     Register DstReg = MI.getOperand(0).getReg();
1889     Register Src0Reg = MI.getOperand(1).getReg();
1890     Register Src1Reg = MI.getOperand(2).getReg();
1891     auto Flags = MI.getFlags();
1892     LLT Ty = MRI.getType(DstReg);
1893 
1894     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1895     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1896     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1897     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1898     MI.eraseFromParent();
1899     return true;
1900 }
1901 
1902 static MachineInstrBuilder extractF64Exponent(Register Hi,
1903                                               MachineIRBuilder &B) {
1904   const unsigned FractBits = 52;
1905   const unsigned ExpBits = 11;
1906   LLT S32 = LLT::scalar(32);
1907 
1908   auto Const0 = B.buildConstant(S32, FractBits - 32);
1909   auto Const1 = B.buildConstant(S32, ExpBits);
1910 
1911   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1912     .addUse(Hi)
1913     .addUse(Const0.getReg(0))
1914     .addUse(Const1.getReg(0));
1915 
1916   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1917 }
1918 
1919 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1920   MachineInstr &MI, MachineRegisterInfo &MRI,
1921   MachineIRBuilder &B) const {
1922   const LLT S1 = LLT::scalar(1);
1923   const LLT S32 = LLT::scalar(32);
1924   const LLT S64 = LLT::scalar(64);
1925 
1926   Register Src = MI.getOperand(1).getReg();
1927   assert(MRI.getType(Src) == S64);
1928 
1929   // TODO: Should this use extract since the low half is unused?
1930   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1931   Register Hi = Unmerge.getReg(1);
1932 
1933   // Extract the upper half, since this is where we will find the sign and
1934   // exponent.
1935   auto Exp = extractF64Exponent(Hi, B);
1936 
1937   const unsigned FractBits = 52;
1938 
1939   // Extract the sign bit.
1940   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1941   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1942 
1943   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1944 
1945   const auto Zero32 = B.buildConstant(S32, 0);
1946 
1947   // Extend back to 64-bits.
1948   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1949 
1950   auto Shr = B.buildAShr(S64, FractMask, Exp);
1951   auto Not = B.buildNot(S64, Shr);
1952   auto Tmp0 = B.buildAnd(S64, Src, Not);
1953   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1954 
1955   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1956   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1957 
1958   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1959   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1960   MI.eraseFromParent();
1961   return true;
1962 }
1963 
1964 bool AMDGPULegalizerInfo::legalizeITOFP(
1965   MachineInstr &MI, MachineRegisterInfo &MRI,
1966   MachineIRBuilder &B, bool Signed) const {
1967 
1968   Register Dst = MI.getOperand(0).getReg();
1969   Register Src = MI.getOperand(1).getReg();
1970 
1971   const LLT S64 = LLT::scalar(64);
1972   const LLT S32 = LLT::scalar(32);
1973 
1974   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1975 
1976   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1977 
1978   auto CvtHi = Signed ?
1979     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1980     B.buildUITOFP(S64, Unmerge.getReg(1));
1981 
1982   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1983 
1984   auto ThirtyTwo = B.buildConstant(S32, 32);
1985   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1986     .addUse(CvtHi.getReg(0))
1987     .addUse(ThirtyTwo.getReg(0));
1988 
1989   // TODO: Should this propagate fast-math-flags?
1990   B.buildFAdd(Dst, LdExp, CvtLo);
1991   MI.eraseFromParent();
1992   return true;
1993 }
1994 
1995 // TODO: Copied from DAG implementation. Verify logic and document how this
1996 // actually works.
1997 bool AMDGPULegalizerInfo::legalizeFPTOI(
1998   MachineInstr &MI, MachineRegisterInfo &MRI,
1999   MachineIRBuilder &B, bool Signed) const {
2000 
2001   Register Dst = MI.getOperand(0).getReg();
2002   Register Src = MI.getOperand(1).getReg();
2003 
2004   const LLT S64 = LLT::scalar(64);
2005   const LLT S32 = LLT::scalar(32);
2006 
2007   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2008 
2009   unsigned Flags = MI.getFlags();
2010 
2011   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2012   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2013   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2014 
2015   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2016   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2017   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2018 
2019   auto Hi = Signed ?
2020     B.buildFPTOSI(S32, FloorMul) :
2021     B.buildFPTOUI(S32, FloorMul);
2022   auto Lo = B.buildFPTOUI(S32, Fma);
2023 
2024   B.buildMerge(Dst, { Lo, Hi });
2025   MI.eraseFromParent();
2026 
2027   return true;
2028 }
2029 
2030 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2031                                                MachineInstr &MI) const {
2032   MachineFunction &MF = Helper.MIRBuilder.getMF();
2033   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2034 
2035   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2036                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2037 
2038   // With ieee_mode disabled, the instructions have the correct behavior
2039   // already for G_FMINNUM/G_FMAXNUM
2040   if (!MFI->getMode().IEEE)
2041     return !IsIEEEOp;
2042 
2043   if (IsIEEEOp)
2044     return true;
2045 
2046   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2050   MachineInstr &MI, MachineRegisterInfo &MRI,
2051   MachineIRBuilder &B) const {
2052   // TODO: Should move some of this into LegalizerHelper.
2053 
2054   // TODO: Promote dynamic indexing of s16 to s32
2055 
2056   // FIXME: Artifact combiner probably should have replaced the truncated
2057   // constant before this, so we shouldn't need
2058   // getConstantVRegValWithLookThrough.
2059   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2060     MI.getOperand(2).getReg(), MRI);
2061   if (!IdxVal) // Dynamic case will be selected to register indexing.
2062     return true;
2063 
2064   Register Dst = MI.getOperand(0).getReg();
2065   Register Vec = MI.getOperand(1).getReg();
2066 
2067   LLT VecTy = MRI.getType(Vec);
2068   LLT EltTy = VecTy.getElementType();
2069   assert(EltTy == MRI.getType(Dst));
2070 
2071   if (IdxVal->Value < VecTy.getNumElements())
2072     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2073   else
2074     B.buildUndef(Dst);
2075 
2076   MI.eraseFromParent();
2077   return true;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2081   MachineInstr &MI, MachineRegisterInfo &MRI,
2082   MachineIRBuilder &B) const {
2083   // TODO: Should move some of this into LegalizerHelper.
2084 
2085   // TODO: Promote dynamic indexing of s16 to s32
2086 
2087   // FIXME: Artifact combiner probably should have replaced the truncated
2088   // constant before this, so we shouldn't need
2089   // getConstantVRegValWithLookThrough.
2090   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2091     MI.getOperand(3).getReg(), MRI);
2092   if (!IdxVal) // Dynamic case will be selected to register indexing.
2093     return true;
2094 
2095   Register Dst = MI.getOperand(0).getReg();
2096   Register Vec = MI.getOperand(1).getReg();
2097   Register Ins = MI.getOperand(2).getReg();
2098 
2099   LLT VecTy = MRI.getType(Vec);
2100   LLT EltTy = VecTy.getElementType();
2101   assert(EltTy == MRI.getType(Ins));
2102 
2103   if (IdxVal->Value < VecTy.getNumElements())
2104     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2105   else
2106     B.buildUndef(Dst);
2107 
2108   MI.eraseFromParent();
2109   return true;
2110 }
2111 
2112 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2113   MachineInstr &MI, MachineRegisterInfo &MRI,
2114   MachineIRBuilder &B) const {
2115   const LLT V2S16 = LLT::vector(2, 16);
2116 
2117   Register Dst = MI.getOperand(0).getReg();
2118   Register Src0 = MI.getOperand(1).getReg();
2119   LLT DstTy = MRI.getType(Dst);
2120   LLT SrcTy = MRI.getType(Src0);
2121 
2122   if (SrcTy == V2S16 && DstTy == V2S16 &&
2123       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2124     return true;
2125 
2126   MachineIRBuilder HelperBuilder(MI);
2127   GISelObserverWrapper DummyObserver;
2128   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2129   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2130 }
2131 
2132 bool AMDGPULegalizerInfo::legalizeSinCos(
2133   MachineInstr &MI, MachineRegisterInfo &MRI,
2134   MachineIRBuilder &B) const {
2135 
2136   Register DstReg = MI.getOperand(0).getReg();
2137   Register SrcReg = MI.getOperand(1).getReg();
2138   LLT Ty = MRI.getType(DstReg);
2139   unsigned Flags = MI.getFlags();
2140 
2141   Register TrigVal;
2142   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2143   if (ST.hasTrigReducedRange()) {
2144     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2145     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2146       .addUse(MulVal.getReg(0))
2147       .setMIFlags(Flags).getReg(0);
2148   } else
2149     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2150 
2151   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2152     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2153   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2154     .addUse(TrigVal)
2155     .setMIFlags(Flags);
2156   MI.eraseFromParent();
2157   return true;
2158 }
2159 
2160 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2161                                                   MachineIRBuilder &B,
2162                                                   const GlobalValue *GV,
2163                                                   int64_t Offset,
2164                                                   unsigned GAFlags) const {
2165   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2166   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2167   // to the following code sequence:
2168   //
2169   // For constant address space:
2170   //   s_getpc_b64 s[0:1]
2171   //   s_add_u32 s0, s0, $symbol
2172   //   s_addc_u32 s1, s1, 0
2173   //
2174   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2175   //   a fixup or relocation is emitted to replace $symbol with a literal
2176   //   constant, which is a pc-relative offset from the encoding of the $symbol
2177   //   operand to the global variable.
2178   //
2179   // For global address space:
2180   //   s_getpc_b64 s[0:1]
2181   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2182   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2183   //
2184   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2185   //   fixups or relocations are emitted to replace $symbol@*@lo and
2186   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2187   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2188   //   operand to the global variable.
2189   //
2190   // What we want here is an offset from the value returned by s_getpc
2191   // (which is the address of the s_add_u32 instruction) to the global
2192   // variable, but since the encoding of $symbol starts 4 bytes after the start
2193   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2194   // small. This requires us to add 4 to the global variable offset in order to
2195   // compute the correct address.
2196 
2197   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2198 
2199   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2200     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2201 
2202   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2203     .addDef(PCReg);
2204 
2205   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2206   if (GAFlags == SIInstrInfo::MO_NONE)
2207     MIB.addImm(0);
2208   else
2209     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2210 
2211   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2212 
2213   if (PtrTy.getSizeInBits() == 32)
2214     B.buildExtract(DstReg, PCReg, 0);
2215   return true;
2216  }
2217 
2218 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2219   MachineInstr &MI, MachineRegisterInfo &MRI,
2220   MachineIRBuilder &B) const {
2221   Register DstReg = MI.getOperand(0).getReg();
2222   LLT Ty = MRI.getType(DstReg);
2223   unsigned AS = Ty.getAddressSpace();
2224 
2225   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2226   MachineFunction &MF = B.getMF();
2227   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2228 
2229   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2230     if (!MFI->isEntryFunction()) {
2231       const Function &Fn = MF.getFunction();
2232       DiagnosticInfoUnsupported BadLDSDecl(
2233         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2234         DS_Warning);
2235       Fn.getContext().diagnose(BadLDSDecl);
2236 
2237       // We currently don't have a way to correctly allocate LDS objects that
2238       // aren't directly associated with a kernel. We do force inlining of
2239       // functions that use local objects. However, if these dead functions are
2240       // not eliminated, we don't want a compile time error. Just emit a warning
2241       // and a trap, since there should be no callable path here.
2242       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2243       B.buildUndef(DstReg);
2244       MI.eraseFromParent();
2245       return true;
2246     }
2247 
2248     // TODO: We could emit code to handle the initialization somewhere.
2249     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2250       const SITargetLowering *TLI = ST.getTargetLowering();
2251       if (!TLI->shouldUseLDSConstAddress(GV)) {
2252         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2253         return true; // Leave in place;
2254       }
2255 
2256       B.buildConstant(
2257           DstReg,
2258           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2259       MI.eraseFromParent();
2260       return true;
2261     }
2262 
2263     const Function &Fn = MF.getFunction();
2264     DiagnosticInfoUnsupported BadInit(
2265       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2266     Fn.getContext().diagnose(BadInit);
2267     return true;
2268   }
2269 
2270   const SITargetLowering *TLI = ST.getTargetLowering();
2271 
2272   if (TLI->shouldEmitFixup(GV)) {
2273     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2274     MI.eraseFromParent();
2275     return true;
2276   }
2277 
2278   if (TLI->shouldEmitPCReloc(GV)) {
2279     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2280     MI.eraseFromParent();
2281     return true;
2282   }
2283 
2284   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2285   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2286 
2287   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2288       MachinePointerInfo::getGOT(MF),
2289       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2290           MachineMemOperand::MOInvariant,
2291       8 /*Size*/, Align(8));
2292 
2293   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2294 
2295   if (Ty.getSizeInBits() == 32) {
2296     // Truncate if this is a 32-bit constant adrdess.
2297     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2298     B.buildExtract(DstReg, Load, 0);
2299   } else
2300     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2301 
2302   MI.eraseFromParent();
2303   return true;
2304 }
2305 
2306 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2307                                        MachineInstr &MI) const {
2308   MachineIRBuilder &B = Helper.MIRBuilder;
2309   MachineRegisterInfo &MRI = *B.getMRI();
2310   GISelChangeObserver &Observer = Helper.Observer;
2311 
2312   Register PtrReg = MI.getOperand(1).getReg();
2313   LLT PtrTy = MRI.getType(PtrReg);
2314   unsigned AddrSpace = PtrTy.getAddressSpace();
2315 
2316   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2317     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2318     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2319     Observer.changingInstr(MI);
2320     MI.getOperand(1).setReg(Cast.getReg(0));
2321     Observer.changedInstr(MI);
2322     return true;
2323   }
2324 
2325   return false;
2326 }
2327 
2328 bool AMDGPULegalizerInfo::legalizeFMad(
2329   MachineInstr &MI, MachineRegisterInfo &MRI,
2330   MachineIRBuilder &B) const {
2331   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2332   assert(Ty.isScalar());
2333 
2334   MachineFunction &MF = B.getMF();
2335   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2336 
2337   // TODO: Always legal with future ftz flag.
2338   // FIXME: Do we need just output?
2339   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2340     return true;
2341   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2342     return true;
2343 
2344   MachineIRBuilder HelperBuilder(MI);
2345   GISelObserverWrapper DummyObserver;
2346   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2347   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2348 }
2349 
2350 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2351   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2352   Register DstReg = MI.getOperand(0).getReg();
2353   Register PtrReg = MI.getOperand(1).getReg();
2354   Register CmpVal = MI.getOperand(2).getReg();
2355   Register NewVal = MI.getOperand(3).getReg();
2356 
2357   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2358          "this should not have been custom lowered");
2359 
2360   LLT ValTy = MRI.getType(CmpVal);
2361   LLT VecTy = LLT::vector(2, ValTy);
2362 
2363   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2364 
2365   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2366     .addDef(DstReg)
2367     .addUse(PtrReg)
2368     .addUse(PackedVal)
2369     .setMemRefs(MI.memoperands());
2370 
2371   MI.eraseFromParent();
2372   return true;
2373 }
2374 
2375 bool AMDGPULegalizerInfo::legalizeFlog(
2376   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2377   Register Dst = MI.getOperand(0).getReg();
2378   Register Src = MI.getOperand(1).getReg();
2379   LLT Ty = B.getMRI()->getType(Dst);
2380   unsigned Flags = MI.getFlags();
2381 
2382   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2383   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2384 
2385   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2386   MI.eraseFromParent();
2387   return true;
2388 }
2389 
2390 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2391                                        MachineIRBuilder &B) const {
2392   Register Dst = MI.getOperand(0).getReg();
2393   Register Src = MI.getOperand(1).getReg();
2394   unsigned Flags = MI.getFlags();
2395   LLT Ty = B.getMRI()->getType(Dst);
2396 
2397   auto K = B.buildFConstant(Ty, numbers::log2e);
2398   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2399   B.buildFExp2(Dst, Mul, Flags);
2400   MI.eraseFromParent();
2401   return true;
2402 }
2403 
2404 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2405                                        MachineIRBuilder &B) const {
2406   Register Dst = MI.getOperand(0).getReg();
2407   Register Src0 = MI.getOperand(1).getReg();
2408   Register Src1 = MI.getOperand(2).getReg();
2409   unsigned Flags = MI.getFlags();
2410   LLT Ty = B.getMRI()->getType(Dst);
2411   const LLT S16 = LLT::scalar(16);
2412   const LLT S32 = LLT::scalar(32);
2413 
2414   if (Ty == S32) {
2415     auto Log = B.buildFLog2(S32, Src0, Flags);
2416     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2417       .addUse(Log.getReg(0))
2418       .addUse(Src1)
2419       .setMIFlags(Flags);
2420     B.buildFExp2(Dst, Mul, Flags);
2421   } else if (Ty == S16) {
2422     // There's no f16 fmul_legacy, so we need to convert for it.
2423     auto Log = B.buildFLog2(S16, Src0, Flags);
2424     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2425     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2426     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2427       .addUse(Ext0.getReg(0))
2428       .addUse(Ext1.getReg(0))
2429       .setMIFlags(Flags);
2430 
2431     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2432   } else
2433     return false;
2434 
2435   MI.eraseFromParent();
2436   return true;
2437 }
2438 
2439 // Find a source register, ignoring any possible source modifiers.
2440 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2441   Register ModSrc = OrigSrc;
2442   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2443     ModSrc = SrcFNeg->getOperand(1).getReg();
2444     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2445       ModSrc = SrcFAbs->getOperand(1).getReg();
2446   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2447     ModSrc = SrcFAbs->getOperand(1).getReg();
2448   return ModSrc;
2449 }
2450 
2451 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2452                                          MachineRegisterInfo &MRI,
2453                                          MachineIRBuilder &B) const {
2454 
2455   const LLT S1 = LLT::scalar(1);
2456   const LLT S64 = LLT::scalar(64);
2457   Register Dst = MI.getOperand(0).getReg();
2458   Register OrigSrc = MI.getOperand(1).getReg();
2459   unsigned Flags = MI.getFlags();
2460   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2461          "this should not have been custom lowered");
2462 
2463   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2464   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2465   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2466   // V_FRACT bug is:
2467   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2468   //
2469   // Convert floor(x) to (x - fract(x))
2470 
2471   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2472     .addUse(OrigSrc)
2473     .setMIFlags(Flags);
2474 
2475   // Give source modifier matching some assistance before obscuring a foldable
2476   // pattern.
2477 
2478   // TODO: We can avoid the neg on the fract? The input sign to fract
2479   // shouldn't matter?
2480   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2481 
2482   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2483 
2484   Register Min = MRI.createGenericVirtualRegister(S64);
2485 
2486   // We don't need to concern ourselves with the snan handling difference, so
2487   // use the one which will directly select.
2488   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2489   if (MFI->getMode().IEEE)
2490     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2491   else
2492     B.buildFMinNum(Min, Fract, Const, Flags);
2493 
2494   Register CorrectedFract = Min;
2495   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2496     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2497     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2498   }
2499 
2500   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2501   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2502 
2503   MI.eraseFromParent();
2504   return true;
2505 }
2506 
2507 // Turn an illegal packed v2s16 build vector into bit operations.
2508 // TODO: This should probably be a bitcast action in LegalizerHelper.
2509 bool AMDGPULegalizerInfo::legalizeBuildVector(
2510   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2511   Register Dst = MI.getOperand(0).getReg();
2512   const LLT S32 = LLT::scalar(32);
2513   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2514 
2515   Register Src0 = MI.getOperand(1).getReg();
2516   Register Src1 = MI.getOperand(2).getReg();
2517   assert(MRI.getType(Src0) == LLT::scalar(16));
2518 
2519   auto Merge = B.buildMerge(S32, {Src0, Src1});
2520   B.buildBitcast(Dst, Merge);
2521 
2522   MI.eraseFromParent();
2523   return true;
2524 }
2525 
2526 // Return the use branch instruction, otherwise null if the usage is invalid.
2527 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2528                                        MachineRegisterInfo &MRI,
2529                                        MachineInstr *&Br,
2530                                        MachineBasicBlock *&UncondBrTarget) {
2531   Register CondDef = MI.getOperand(0).getReg();
2532   if (!MRI.hasOneNonDBGUse(CondDef))
2533     return nullptr;
2534 
2535   MachineBasicBlock *Parent = MI.getParent();
2536   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2537   if (UseMI.getParent() != Parent ||
2538       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2539     return nullptr;
2540 
2541   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2542   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2543   if (Next == Parent->end()) {
2544     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2545     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2546       return nullptr;
2547     UncondBrTarget = &*NextMBB;
2548   } else {
2549     if (Next->getOpcode() != AMDGPU::G_BR)
2550       return nullptr;
2551     Br = &*Next;
2552     UncondBrTarget = Br->getOperand(0).getMBB();
2553   }
2554 
2555   return &UseMI;
2556 }
2557 
2558 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2559                                          const ArgDescriptor *Arg,
2560                                          const TargetRegisterClass *ArgRC,
2561                                          LLT ArgTy) const {
2562   MCRegister SrcReg = Arg->getRegister();
2563   assert(SrcReg.isPhysical() && "Physical register expected");
2564   assert(DstReg.isVirtual() && "Virtual register expected");
2565 
2566   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2567                                              ArgTy);
2568   if (Arg->isMasked()) {
2569     // TODO: Should we try to emit this once in the entry block?
2570     const LLT S32 = LLT::scalar(32);
2571     const unsigned Mask = Arg->getMask();
2572     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2573 
2574     Register AndMaskSrc = LiveIn;
2575 
2576     if (Shift != 0) {
2577       auto ShiftAmt = B.buildConstant(S32, Shift);
2578       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2579     }
2580 
2581     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2582   } else {
2583     B.buildCopy(DstReg, LiveIn);
2584   }
2585 
2586   return true;
2587 }
2588 
2589 bool AMDGPULegalizerInfo::loadInputValue(
2590     Register DstReg, MachineIRBuilder &B,
2591     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2592   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2593   const ArgDescriptor *Arg;
2594   const TargetRegisterClass *ArgRC;
2595   LLT ArgTy;
2596   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2597 
2598   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2599     return false; // TODO: Handle these
2600   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2601 }
2602 
2603 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2604     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2605     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2606   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2607     return false;
2608 
2609   MI.eraseFromParent();
2610   return true;
2611 }
2612 
2613 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2614                                        MachineRegisterInfo &MRI,
2615                                        MachineIRBuilder &B) const {
2616   Register Dst = MI.getOperand(0).getReg();
2617   LLT DstTy = MRI.getType(Dst);
2618   LLT S16 = LLT::scalar(16);
2619   LLT S32 = LLT::scalar(32);
2620   LLT S64 = LLT::scalar(64);
2621 
2622   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2623     return true;
2624 
2625   if (DstTy == S16)
2626     return legalizeFDIV16(MI, MRI, B);
2627   if (DstTy == S32)
2628     return legalizeFDIV32(MI, MRI, B);
2629   if (DstTy == S64)
2630     return legalizeFDIV64(MI, MRI, B);
2631 
2632   return false;
2633 }
2634 
2635 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2636                                                   Register DstReg,
2637                                                   Register X,
2638                                                   Register Y,
2639                                                   bool IsDiv) const {
2640   const LLT S1 = LLT::scalar(1);
2641   const LLT S32 = LLT::scalar(32);
2642 
2643   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2644   // algorithm used here.
2645 
2646   // Initial estimate of inv(y).
2647   auto FloatY = B.buildUITOFP(S32, Y);
2648   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2649   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2650   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2651   auto Z = B.buildFPTOUI(S32, ScaledY);
2652 
2653   // One round of UNR.
2654   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2655   auto NegYZ = B.buildMul(S32, NegY, Z);
2656   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2657 
2658   // Quotient/remainder estimate.
2659   auto Q = B.buildUMulH(S32, X, Z);
2660   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2661 
2662   // First quotient/remainder refinement.
2663   auto One = B.buildConstant(S32, 1);
2664   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2665   if (IsDiv)
2666     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2667   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2668 
2669   // Second quotient/remainder refinement.
2670   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2671   if (IsDiv)
2672     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2673   else
2674     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2675 }
2676 
2677 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2678                                               MachineRegisterInfo &MRI,
2679                                               MachineIRBuilder &B) const {
2680   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2681   Register DstReg = MI.getOperand(0).getReg();
2682   Register Num = MI.getOperand(1).getReg();
2683   Register Den = MI.getOperand(2).getReg();
2684   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2685   MI.eraseFromParent();
2686   return true;
2687 }
2688 
2689 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2690 //
2691 // Return lo, hi of result
2692 //
2693 // %cvt.lo = G_UITOFP Val.lo
2694 // %cvt.hi = G_UITOFP Val.hi
2695 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2696 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2697 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2698 // %mul2 = G_FMUL %mul1, 2**(-32)
2699 // %trunc = G_INTRINSIC_TRUNC %mul2
2700 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2701 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2702 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2703                                                        Register Val) {
2704   const LLT S32 = LLT::scalar(32);
2705   auto Unmerge = B.buildUnmerge(S32, Val);
2706 
2707   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2708   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2709 
2710   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2711                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2712 
2713   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2714   auto Mul1 =
2715       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2716 
2717   // 2**(-32)
2718   auto Mul2 =
2719       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2720   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2721 
2722   // -(2**32)
2723   auto Mad2 = B.buildFMAD(S32, Trunc,
2724                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2725 
2726   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2727   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2728 
2729   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2730 }
2731 
2732 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2733                                                   Register DstReg,
2734                                                   Register Numer,
2735                                                   Register Denom,
2736                                                   bool IsDiv) const {
2737   const LLT S32 = LLT::scalar(32);
2738   const LLT S64 = LLT::scalar(64);
2739   const LLT S1 = LLT::scalar(1);
2740   Register RcpLo, RcpHi;
2741 
2742   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2743 
2744   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2745 
2746   auto Zero64 = B.buildConstant(S64, 0);
2747   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2748 
2749   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2750   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2751 
2752   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2753   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2754   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2755 
2756   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2757   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2758   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2759   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2760 
2761   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2762   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2763   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2764   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2765   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2766 
2767   auto Zero32 = B.buildConstant(S32, 0);
2768   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2769   auto Add2_HiC =
2770       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2771   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2772   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2773 
2774   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2775   Register NumerLo = UnmergeNumer.getReg(0);
2776   Register NumerHi = UnmergeNumer.getReg(1);
2777 
2778   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2779   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2780   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2781   Register Mul3_Lo = UnmergeMul3.getReg(0);
2782   Register Mul3_Hi = UnmergeMul3.getReg(1);
2783   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2784   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2785   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2786   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2787 
2788   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2789   Register DenomLo = UnmergeDenom.getReg(0);
2790   Register DenomHi = UnmergeDenom.getReg(1);
2791 
2792   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2793   auto C1 = B.buildSExt(S32, CmpHi);
2794 
2795   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2796   auto C2 = B.buildSExt(S32, CmpLo);
2797 
2798   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2799   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2800 
2801   // TODO: Here and below portions of the code can be enclosed into if/endif.
2802   // Currently control flow is unconditional and we have 4 selects after
2803   // potential endif to substitute PHIs.
2804 
2805   // if C3 != 0 ...
2806   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2807   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2808   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2809   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2810 
2811   auto One64 = B.buildConstant(S64, 1);
2812   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2813 
2814   auto C4 =
2815       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2816   auto C5 =
2817       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2818   auto C6 = B.buildSelect(
2819       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2820 
2821   // if (C6 != 0)
2822   auto Add4 = B.buildAdd(S64, Add3, One64);
2823   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2824 
2825   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2826   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2827   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2828 
2829   // endif C6
2830   // endif C3
2831 
2832   if (IsDiv) {
2833     auto Sel1 = B.buildSelect(
2834         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2835     B.buildSelect(DstReg,
2836                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2837   } else {
2838     auto Sel2 = B.buildSelect(
2839         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2840     B.buildSelect(DstReg,
2841                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2842   }
2843 }
2844 
2845 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2846                                             MachineRegisterInfo &MRI,
2847                                             MachineIRBuilder &B) const {
2848   const LLT S64 = LLT::scalar(64);
2849   const LLT S32 = LLT::scalar(32);
2850   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2851   Register DstReg = MI.getOperand(0).getReg();
2852   Register Num = MI.getOperand(1).getReg();
2853   Register Den = MI.getOperand(2).getReg();
2854   LLT Ty = MRI.getType(DstReg);
2855 
2856   if (Ty == S32)
2857     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2858   else if (Ty == S64)
2859     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2860   else
2861     return false;
2862 
2863   MI.eraseFromParent();
2864   return true;
2865 
2866 }
2867 
2868 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2869                                             MachineRegisterInfo &MRI,
2870                                             MachineIRBuilder &B) const {
2871   const LLT S64 = LLT::scalar(64);
2872   const LLT S32 = LLT::scalar(32);
2873 
2874   Register DstReg = MI.getOperand(0).getReg();
2875   const LLT Ty = MRI.getType(DstReg);
2876   if (Ty != S32 && Ty != S64)
2877     return false;
2878 
2879   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2880 
2881   Register LHS = MI.getOperand(1).getReg();
2882   Register RHS = MI.getOperand(2).getReg();
2883 
2884   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2885   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2886   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2887 
2888   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2889   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2890 
2891   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2892   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2893 
2894   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2895   if (Ty == S32)
2896     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2897   else
2898     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2899 
2900   Register Sign;
2901   if (IsDiv)
2902     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2903   else
2904     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2905 
2906   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2907   B.buildSub(DstReg, UDivRem, Sign);
2908 
2909   MI.eraseFromParent();
2910   return true;
2911 }
2912 
2913 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2914                                                  MachineRegisterInfo &MRI,
2915                                                  MachineIRBuilder &B) const {
2916   Register Res = MI.getOperand(0).getReg();
2917   Register LHS = MI.getOperand(1).getReg();
2918   Register RHS = MI.getOperand(2).getReg();
2919 
2920   uint16_t Flags = MI.getFlags();
2921 
2922   LLT ResTy = MRI.getType(Res);
2923   LLT S32 = LLT::scalar(32);
2924   LLT S64 = LLT::scalar(64);
2925 
2926   const MachineFunction &MF = B.getMF();
2927   bool Unsafe =
2928     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2929 
2930   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2931     return false;
2932 
2933   if (!Unsafe && ResTy == S32 &&
2934       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2935     return false;
2936 
2937   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2938     // 1 / x -> RCP(x)
2939     if (CLHS->isExactlyValue(1.0)) {
2940       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2941         .addUse(RHS)
2942         .setMIFlags(Flags);
2943 
2944       MI.eraseFromParent();
2945       return true;
2946     }
2947 
2948     // -1 / x -> RCP( FNEG(x) )
2949     if (CLHS->isExactlyValue(-1.0)) {
2950       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2951       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2952         .addUse(FNeg.getReg(0))
2953         .setMIFlags(Flags);
2954 
2955       MI.eraseFromParent();
2956       return true;
2957     }
2958   }
2959 
2960   // x / y -> x * (1.0 / y)
2961   if (Unsafe) {
2962     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2963       .addUse(RHS)
2964       .setMIFlags(Flags);
2965     B.buildFMul(Res, LHS, RCP, Flags);
2966 
2967     MI.eraseFromParent();
2968     return true;
2969   }
2970 
2971   return false;
2972 }
2973 
2974 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2975                                          MachineRegisterInfo &MRI,
2976                                          MachineIRBuilder &B) const {
2977   Register Res = MI.getOperand(0).getReg();
2978   Register LHS = MI.getOperand(1).getReg();
2979   Register RHS = MI.getOperand(2).getReg();
2980 
2981   uint16_t Flags = MI.getFlags();
2982 
2983   LLT S16 = LLT::scalar(16);
2984   LLT S32 = LLT::scalar(32);
2985 
2986   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2987   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2988 
2989   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2990     .addUse(RHSExt.getReg(0))
2991     .setMIFlags(Flags);
2992 
2993   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2994   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2995 
2996   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2997     .addUse(RDst.getReg(0))
2998     .addUse(RHS)
2999     .addUse(LHS)
3000     .setMIFlags(Flags);
3001 
3002   MI.eraseFromParent();
3003   return true;
3004 }
3005 
3006 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3007 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3008 static void toggleSPDenormMode(bool Enable,
3009                                MachineIRBuilder &B,
3010                                const GCNSubtarget &ST,
3011                                AMDGPU::SIModeRegisterDefaults Mode) {
3012   // Set SP denorm mode to this value.
3013   unsigned SPDenormMode =
3014     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3015 
3016   if (ST.hasDenormModeInst()) {
3017     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3018     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3019 
3020     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3021     B.buildInstr(AMDGPU::S_DENORM_MODE)
3022       .addImm(NewDenormModeValue);
3023 
3024   } else {
3025     // Select FP32 bit field in mode register.
3026     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3027                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3028                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3029 
3030     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3031       .addImm(SPDenormMode)
3032       .addImm(SPDenormModeBitField);
3033   }
3034 }
3035 
3036 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3037                                          MachineRegisterInfo &MRI,
3038                                          MachineIRBuilder &B) const {
3039   Register Res = MI.getOperand(0).getReg();
3040   Register LHS = MI.getOperand(1).getReg();
3041   Register RHS = MI.getOperand(2).getReg();
3042   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3043   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3044 
3045   uint16_t Flags = MI.getFlags();
3046 
3047   LLT S32 = LLT::scalar(32);
3048   LLT S1 = LLT::scalar(1);
3049 
3050   auto One = B.buildFConstant(S32, 1.0f);
3051 
3052   auto DenominatorScaled =
3053     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3054       .addUse(LHS)
3055       .addUse(RHS)
3056       .addImm(0)
3057       .setMIFlags(Flags);
3058   auto NumeratorScaled =
3059     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3060       .addUse(LHS)
3061       .addUse(RHS)
3062       .addImm(1)
3063       .setMIFlags(Flags);
3064 
3065   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3066     .addUse(DenominatorScaled.getReg(0))
3067     .setMIFlags(Flags);
3068   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3069 
3070   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3071   // aren't modeled as reading it.
3072   if (!Mode.allFP32Denormals())
3073     toggleSPDenormMode(true, B, ST, Mode);
3074 
3075   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3076   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3077   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3078   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3079   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3080   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3081 
3082   if (!Mode.allFP32Denormals())
3083     toggleSPDenormMode(false, B, ST, Mode);
3084 
3085   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3086     .addUse(Fma4.getReg(0))
3087     .addUse(Fma1.getReg(0))
3088     .addUse(Fma3.getReg(0))
3089     .addUse(NumeratorScaled.getReg(1))
3090     .setMIFlags(Flags);
3091 
3092   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3093     .addUse(Fmas.getReg(0))
3094     .addUse(RHS)
3095     .addUse(LHS)
3096     .setMIFlags(Flags);
3097 
3098   MI.eraseFromParent();
3099   return true;
3100 }
3101 
3102 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3103                                          MachineRegisterInfo &MRI,
3104                                          MachineIRBuilder &B) const {
3105   Register Res = MI.getOperand(0).getReg();
3106   Register LHS = MI.getOperand(1).getReg();
3107   Register RHS = MI.getOperand(2).getReg();
3108 
3109   uint16_t Flags = MI.getFlags();
3110 
3111   LLT S64 = LLT::scalar(64);
3112   LLT S1 = LLT::scalar(1);
3113 
3114   auto One = B.buildFConstant(S64, 1.0);
3115 
3116   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3117     .addUse(LHS)
3118     .addUse(RHS)
3119     .addImm(0)
3120     .setMIFlags(Flags);
3121 
3122   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3123 
3124   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3125     .addUse(DivScale0.getReg(0))
3126     .setMIFlags(Flags);
3127 
3128   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3129   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3130   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3131 
3132   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3133     .addUse(LHS)
3134     .addUse(RHS)
3135     .addImm(1)
3136     .setMIFlags(Flags);
3137 
3138   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3139   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3140   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3141 
3142   Register Scale;
3143   if (!ST.hasUsableDivScaleConditionOutput()) {
3144     // Workaround a hardware bug on SI where the condition output from div_scale
3145     // is not usable.
3146 
3147     LLT S32 = LLT::scalar(32);
3148 
3149     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3150     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3151     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3152     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3153 
3154     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3155                               Scale1Unmerge.getReg(1));
3156     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3157                               Scale0Unmerge.getReg(1));
3158     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3159   } else {
3160     Scale = DivScale1.getReg(1);
3161   }
3162 
3163   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3164     .addUse(Fma4.getReg(0))
3165     .addUse(Fma3.getReg(0))
3166     .addUse(Mul.getReg(0))
3167     .addUse(Scale)
3168     .setMIFlags(Flags);
3169 
3170   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3171     .addUse(Fmas.getReg(0))
3172     .addUse(RHS)
3173     .addUse(LHS)
3174     .setMIFlags(Flags);
3175 
3176   MI.eraseFromParent();
3177   return true;
3178 }
3179 
3180 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3181                                                  MachineRegisterInfo &MRI,
3182                                                  MachineIRBuilder &B) const {
3183   Register Res = MI.getOperand(0).getReg();
3184   Register LHS = MI.getOperand(2).getReg();
3185   Register RHS = MI.getOperand(3).getReg();
3186   uint16_t Flags = MI.getFlags();
3187 
3188   LLT S32 = LLT::scalar(32);
3189   LLT S1 = LLT::scalar(1);
3190 
3191   auto Abs = B.buildFAbs(S32, RHS, Flags);
3192   const APFloat C0Val(1.0f);
3193 
3194   auto C0 = B.buildConstant(S32, 0x6f800000);
3195   auto C1 = B.buildConstant(S32, 0x2f800000);
3196   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3197 
3198   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3199   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3200 
3201   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3202 
3203   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3204     .addUse(Mul0.getReg(0))
3205     .setMIFlags(Flags);
3206 
3207   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3208 
3209   B.buildFMul(Res, Sel, Mul1, Flags);
3210 
3211   MI.eraseFromParent();
3212   return true;
3213 }
3214 
3215 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3216 // FIXME: Why do we handle this one but not other removed instructions?
3217 //
3218 // Reciprocal square root.  The clamp prevents infinite results, clamping
3219 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3220 // +-max_float.
3221 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3222                                                     MachineRegisterInfo &MRI,
3223                                                     MachineIRBuilder &B) const {
3224   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3225     return true;
3226 
3227   Register Dst = MI.getOperand(0).getReg();
3228   Register Src = MI.getOperand(2).getReg();
3229   auto Flags = MI.getFlags();
3230 
3231   LLT Ty = MRI.getType(Dst);
3232 
3233   const fltSemantics *FltSemantics;
3234   if (Ty == LLT::scalar(32))
3235     FltSemantics = &APFloat::IEEEsingle();
3236   else if (Ty == LLT::scalar(64))
3237     FltSemantics = &APFloat::IEEEdouble();
3238   else
3239     return false;
3240 
3241   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3242     .addUse(Src)
3243     .setMIFlags(Flags);
3244 
3245   // We don't need to concern ourselves with the snan handling difference, since
3246   // the rsq quieted (or not) so use the one which will directly select.
3247   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3248   const bool UseIEEE = MFI->getMode().IEEE;
3249 
3250   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3251   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3252                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3253 
3254   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3255 
3256   if (UseIEEE)
3257     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3258   else
3259     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3260   MI.eraseFromParent();
3261   return true;
3262 }
3263 
3264 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3265   switch (IID) {
3266   case Intrinsic::amdgcn_ds_fadd:
3267     return AMDGPU::G_ATOMICRMW_FADD;
3268   case Intrinsic::amdgcn_ds_fmin:
3269     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3270   case Intrinsic::amdgcn_ds_fmax:
3271     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3272   default:
3273     llvm_unreachable("not a DS FP intrinsic");
3274   }
3275 }
3276 
3277 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3278                                                       MachineInstr &MI,
3279                                                       Intrinsic::ID IID) const {
3280   GISelChangeObserver &Observer = Helper.Observer;
3281   Observer.changingInstr(MI);
3282 
3283   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3284 
3285   // The remaining operands were used to set fields in the MemOperand on
3286   // construction.
3287   for (int I = 6; I > 3; --I)
3288     MI.RemoveOperand(I);
3289 
3290   MI.RemoveOperand(1); // Remove the intrinsic ID.
3291   Observer.changedInstr(MI);
3292   return true;
3293 }
3294 
3295 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3296                                             MachineRegisterInfo &MRI,
3297                                             MachineIRBuilder &B) const {
3298   uint64_t Offset =
3299     ST.getTargetLowering()->getImplicitParameterOffset(
3300       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3301   LLT DstTy = MRI.getType(DstReg);
3302   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3303 
3304   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3305   if (!loadInputValue(KernargPtrReg, B,
3306                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3307     return false;
3308 
3309   // FIXME: This should be nuw
3310   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3311   return true;
3312 }
3313 
3314 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3315                                                  MachineRegisterInfo &MRI,
3316                                                  MachineIRBuilder &B) const {
3317   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3318   if (!MFI->isEntryFunction()) {
3319     return legalizePreloadedArgIntrin(MI, MRI, B,
3320                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3321   }
3322 
3323   Register DstReg = MI.getOperand(0).getReg();
3324   if (!getImplicitArgPtr(DstReg, MRI, B))
3325     return false;
3326 
3327   MI.eraseFromParent();
3328   return true;
3329 }
3330 
3331 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3332                                               MachineRegisterInfo &MRI,
3333                                               MachineIRBuilder &B,
3334                                               unsigned AddrSpace) const {
3335   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3336   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3337   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3338   MI.eraseFromParent();
3339   return true;
3340 }
3341 
3342 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3343 // offset (the offset that is included in bounds checking and swizzling, to be
3344 // split between the instruction's voffset and immoffset fields) and soffset
3345 // (the offset that is excluded from bounds checking and swizzling, to go in
3346 // the instruction's soffset field).  This function takes the first kind of
3347 // offset and figures out how to split it between voffset and immoffset.
3348 std::tuple<Register, unsigned, unsigned>
3349 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3350                                         Register OrigOffset) const {
3351   const unsigned MaxImm = 4095;
3352   Register BaseReg;
3353   unsigned TotalConstOffset;
3354   MachineInstr *OffsetDef;
3355   const LLT S32 = LLT::scalar(32);
3356 
3357   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3358     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3359 
3360   unsigned ImmOffset = TotalConstOffset;
3361 
3362   // If the immediate value is too big for the immoffset field, put the value
3363   // and -4096 into the immoffset field so that the value that is copied/added
3364   // for the voffset field is a multiple of 4096, and it stands more chance
3365   // of being CSEd with the copy/add for another similar load/store.
3366   // However, do not do that rounding down to a multiple of 4096 if that is a
3367   // negative number, as it appears to be illegal to have a negative offset
3368   // in the vgpr, even if adding the immediate offset makes it positive.
3369   unsigned Overflow = ImmOffset & ~MaxImm;
3370   ImmOffset -= Overflow;
3371   if ((int32_t)Overflow < 0) {
3372     Overflow += ImmOffset;
3373     ImmOffset = 0;
3374   }
3375 
3376   if (Overflow != 0) {
3377     if (!BaseReg) {
3378       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3379     } else {
3380       auto OverflowVal = B.buildConstant(S32, Overflow);
3381       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3382     }
3383   }
3384 
3385   if (!BaseReg)
3386     BaseReg = B.buildConstant(S32, 0).getReg(0);
3387 
3388   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3389 }
3390 
3391 /// Handle register layout difference for f16 images for some subtargets.
3392 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3393                                              MachineRegisterInfo &MRI,
3394                                              Register Reg) const {
3395   if (!ST.hasUnpackedD16VMem())
3396     return Reg;
3397 
3398   const LLT S16 = LLT::scalar(16);
3399   const LLT S32 = LLT::scalar(32);
3400   LLT StoreVT = MRI.getType(Reg);
3401   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3402 
3403   auto Unmerge = B.buildUnmerge(S16, Reg);
3404 
3405   SmallVector<Register, 4> WideRegs;
3406   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3407     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3408 
3409   int NumElts = StoreVT.getNumElements();
3410 
3411   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3412 }
3413 
3414 Register AMDGPULegalizerInfo::fixStoreSourceType(
3415   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3416   MachineRegisterInfo *MRI = B.getMRI();
3417   LLT Ty = MRI->getType(VData);
3418 
3419   const LLT S16 = LLT::scalar(16);
3420 
3421   // Fixup illegal register types for i8 stores.
3422   if (Ty == LLT::scalar(8) || Ty == S16) {
3423     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3424     return AnyExt;
3425   }
3426 
3427   if (Ty.isVector()) {
3428     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3429       if (IsFormat)
3430         return handleD16VData(B, *MRI, VData);
3431     }
3432   }
3433 
3434   return VData;
3435 }
3436 
3437 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3438                                               MachineRegisterInfo &MRI,
3439                                               MachineIRBuilder &B,
3440                                               bool IsTyped,
3441                                               bool IsFormat) const {
3442   Register VData = MI.getOperand(1).getReg();
3443   LLT Ty = MRI.getType(VData);
3444   LLT EltTy = Ty.getScalarType();
3445   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3446   const LLT S32 = LLT::scalar(32);
3447 
3448   VData = fixStoreSourceType(B, VData, IsFormat);
3449   Register RSrc = MI.getOperand(2).getReg();
3450 
3451   MachineMemOperand *MMO = *MI.memoperands_begin();
3452   const int MemSize = MMO->getSize();
3453 
3454   unsigned ImmOffset;
3455   unsigned TotalOffset;
3456 
3457   // The typed intrinsics add an immediate after the registers.
3458   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3459 
3460   // The struct intrinsic variants add one additional operand over raw.
3461   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3462   Register VIndex;
3463   int OpOffset = 0;
3464   if (HasVIndex) {
3465     VIndex = MI.getOperand(3).getReg();
3466     OpOffset = 1;
3467   }
3468 
3469   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3470   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3471 
3472   unsigned Format = 0;
3473   if (IsTyped) {
3474     Format = MI.getOperand(5 + OpOffset).getImm();
3475     ++OpOffset;
3476   }
3477 
3478   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3479 
3480   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3481   if (TotalOffset != 0)
3482     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3483 
3484   unsigned Opc;
3485   if (IsTyped) {
3486     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3487                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3488   } else if (IsFormat) {
3489     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3490                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3491   } else {
3492     switch (MemSize) {
3493     case 1:
3494       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3495       break;
3496     case 2:
3497       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3498       break;
3499     default:
3500       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3501       break;
3502     }
3503   }
3504 
3505   if (!VIndex)
3506     VIndex = B.buildConstant(S32, 0).getReg(0);
3507 
3508   auto MIB = B.buildInstr(Opc)
3509     .addUse(VData)              // vdata
3510     .addUse(RSrc)               // rsrc
3511     .addUse(VIndex)             // vindex
3512     .addUse(VOffset)            // voffset
3513     .addUse(SOffset)            // soffset
3514     .addImm(ImmOffset);         // offset(imm)
3515 
3516   if (IsTyped)
3517     MIB.addImm(Format);
3518 
3519   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3520      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3521      .addMemOperand(MMO);
3522 
3523   MI.eraseFromParent();
3524   return true;
3525 }
3526 
3527 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3528                                              MachineRegisterInfo &MRI,
3529                                              MachineIRBuilder &B,
3530                                              bool IsFormat,
3531                                              bool IsTyped) const {
3532   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3533   MachineMemOperand *MMO = *MI.memoperands_begin();
3534   const int MemSize = MMO->getSize();
3535   const LLT S32 = LLT::scalar(32);
3536 
3537   Register Dst = MI.getOperand(0).getReg();
3538   Register RSrc = MI.getOperand(2).getReg();
3539 
3540   // The typed intrinsics add an immediate after the registers.
3541   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3542 
3543   // The struct intrinsic variants add one additional operand over raw.
3544   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3545   Register VIndex;
3546   int OpOffset = 0;
3547   if (HasVIndex) {
3548     VIndex = MI.getOperand(3).getReg();
3549     OpOffset = 1;
3550   }
3551 
3552   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3553   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3554 
3555   unsigned Format = 0;
3556   if (IsTyped) {
3557     Format = MI.getOperand(5 + OpOffset).getImm();
3558     ++OpOffset;
3559   }
3560 
3561   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3562   unsigned ImmOffset;
3563   unsigned TotalOffset;
3564 
3565   LLT Ty = MRI.getType(Dst);
3566   LLT EltTy = Ty.getScalarType();
3567   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3568   const bool Unpacked = ST.hasUnpackedD16VMem();
3569 
3570   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3571   if (TotalOffset != 0)
3572     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3573 
3574   unsigned Opc;
3575 
3576   if (IsTyped) {
3577     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3578                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3579   } else if (IsFormat) {
3580     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3581                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3582   } else {
3583     switch (MemSize) {
3584     case 1:
3585       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3586       break;
3587     case 2:
3588       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3589       break;
3590     default:
3591       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3592       break;
3593     }
3594   }
3595 
3596   Register LoadDstReg;
3597 
3598   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3599   LLT UnpackedTy = Ty.changeElementSize(32);
3600 
3601   if (IsExtLoad)
3602     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3603   else if (Unpacked && IsD16 && Ty.isVector())
3604     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3605   else
3606     LoadDstReg = Dst;
3607 
3608   if (!VIndex)
3609     VIndex = B.buildConstant(S32, 0).getReg(0);
3610 
3611   auto MIB = B.buildInstr(Opc)
3612     .addDef(LoadDstReg)         // vdata
3613     .addUse(RSrc)               // rsrc
3614     .addUse(VIndex)             // vindex
3615     .addUse(VOffset)            // voffset
3616     .addUse(SOffset)            // soffset
3617     .addImm(ImmOffset);         // offset(imm)
3618 
3619   if (IsTyped)
3620     MIB.addImm(Format);
3621 
3622   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3623      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3624      .addMemOperand(MMO);
3625 
3626   if (LoadDstReg != Dst) {
3627     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3628 
3629     // Widen result for extending loads was widened.
3630     if (IsExtLoad)
3631       B.buildTrunc(Dst, LoadDstReg);
3632     else {
3633       // Repack to original 16-bit vector result
3634       // FIXME: G_TRUNC should work, but legalization currently fails
3635       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3636       SmallVector<Register, 4> Repack;
3637       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3638         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3639       B.buildMerge(Dst, Repack);
3640     }
3641   }
3642 
3643   MI.eraseFromParent();
3644   return true;
3645 }
3646 
3647 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3648                                                MachineIRBuilder &B,
3649                                                bool IsInc) const {
3650   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3651                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3652   B.buildInstr(Opc)
3653     .addDef(MI.getOperand(0).getReg())
3654     .addUse(MI.getOperand(2).getReg())
3655     .addUse(MI.getOperand(3).getReg())
3656     .cloneMemRefs(MI);
3657   MI.eraseFromParent();
3658   return true;
3659 }
3660 
3661 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3662   switch (IntrID) {
3663   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3664   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3665     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3666   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3667   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3668     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3669   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3670   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3671     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3672   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3673   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3674     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3675   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3676   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3677     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3678   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3679   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3680     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3681   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3682   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3683     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3684   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3685   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3686     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3687   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3688   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3689     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3690   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3691   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3692     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3693   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3694   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3695     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3696   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3697   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3698     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3699   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3700   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3701     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3702   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3703   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3704     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3705   default:
3706     llvm_unreachable("unhandled atomic opcode");
3707   }
3708 }
3709 
3710 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3711                                                MachineIRBuilder &B,
3712                                                Intrinsic::ID IID) const {
3713   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3714                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3715   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3716 
3717   Register Dst;
3718 
3719   int OpOffset = 0;
3720   if (HasReturn) {
3721     // A few FP atomics do not support return values.
3722     Dst = MI.getOperand(0).getReg();
3723   } else {
3724     OpOffset = -1;
3725   }
3726 
3727   Register VData = MI.getOperand(2 + OpOffset).getReg();
3728   Register CmpVal;
3729 
3730   if (IsCmpSwap) {
3731     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3732     ++OpOffset;
3733   }
3734 
3735   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3736   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3737 
3738   // The struct intrinsic variants add one additional operand over raw.
3739   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3740   Register VIndex;
3741   if (HasVIndex) {
3742     VIndex = MI.getOperand(4 + OpOffset).getReg();
3743     ++OpOffset;
3744   }
3745 
3746   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3747   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3748   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3749 
3750   MachineMemOperand *MMO = *MI.memoperands_begin();
3751 
3752   unsigned ImmOffset;
3753   unsigned TotalOffset;
3754   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3755   if (TotalOffset != 0)
3756     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3757 
3758   if (!VIndex)
3759     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3760 
3761   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3762 
3763   if (HasReturn)
3764     MIB.addDef(Dst);
3765 
3766   MIB.addUse(VData); // vdata
3767 
3768   if (IsCmpSwap)
3769     MIB.addReg(CmpVal);
3770 
3771   MIB.addUse(RSrc)               // rsrc
3772      .addUse(VIndex)             // vindex
3773      .addUse(VOffset)            // voffset
3774      .addUse(SOffset)            // soffset
3775      .addImm(ImmOffset)          // offset(imm)
3776      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3777      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3778      .addMemOperand(MMO);
3779 
3780   MI.eraseFromParent();
3781   return true;
3782 }
3783 
3784 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3785 /// vector with s16 typed elements.
3786 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3787                                         SmallVectorImpl<Register> &PackedAddrs,
3788                                         int AddrIdx, int DimIdx, int EndIdx,
3789                                         int NumGradients) {
3790   const LLT S16 = LLT::scalar(16);
3791   const LLT V2S16 = LLT::vector(2, 16);
3792 
3793   for (int I = AddrIdx; I < EndIdx; ++I) {
3794     MachineOperand &SrcOp = MI.getOperand(I);
3795     if (!SrcOp.isReg())
3796       continue; // _L to _LZ may have eliminated this.
3797 
3798     Register AddrReg = SrcOp.getReg();
3799 
3800     if (I < DimIdx) {
3801       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3802       PackedAddrs.push_back(AddrReg);
3803     } else {
3804       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3805       // derivatives dx/dh and dx/dv are packed with undef.
3806       if (((I + 1) >= EndIdx) ||
3807           ((NumGradients / 2) % 2 == 1 &&
3808            (I == DimIdx + (NumGradients / 2) - 1 ||
3809             I == DimIdx + NumGradients - 1)) ||
3810           // Check for _L to _LZ optimization
3811           !MI.getOperand(I + 1).isReg()) {
3812         PackedAddrs.push_back(
3813             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3814                 .getReg(0));
3815       } else {
3816         PackedAddrs.push_back(
3817             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3818                 .getReg(0));
3819         ++I;
3820       }
3821     }
3822   }
3823 }
3824 
3825 /// Convert from separate vaddr components to a single vector address register,
3826 /// and replace the remaining operands with $noreg.
3827 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3828                                      int DimIdx, int NumVAddrs) {
3829   const LLT S32 = LLT::scalar(32);
3830 
3831   SmallVector<Register, 8> AddrRegs;
3832   for (int I = 0; I != NumVAddrs; ++I) {
3833     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3834     if (SrcOp.isReg()) {
3835       AddrRegs.push_back(SrcOp.getReg());
3836       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3837     }
3838   }
3839 
3840   int NumAddrRegs = AddrRegs.size();
3841   if (NumAddrRegs != 1) {
3842     // Round up to 8 elements for v5-v7
3843     // FIXME: Missing intermediate sized register classes and instructions.
3844     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3845       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3846       auto Undef = B.buildUndef(S32);
3847       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3848       NumAddrRegs = RoundedNumRegs;
3849     }
3850 
3851     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3852     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3853   }
3854 
3855   for (int I = 1; I != NumVAddrs; ++I) {
3856     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3857     if (SrcOp.isReg())
3858       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3859   }
3860 }
3861 
3862 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3863 ///
3864 /// Depending on the subtarget, load/store with 16-bit element data need to be
3865 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3866 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3867 /// registers.
3868 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
3875 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3876     MachineInstr &MI, MachineIRBuilder &B,
3877     GISelChangeObserver &Observer,
3878     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3879 
3880   const int NumDefs = MI.getNumExplicitDefs();
3881   bool IsTFE = NumDefs == 2;
3882   // We are only processing the operands of d16 image operations on subtargets
3883   // that use the unpacked register layout, or need to repack the TFE result.
3884 
3885   // TODO: Do we need to guard against already legalized intrinsics?
3886   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3887     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3888 
3889   MachineRegisterInfo *MRI = B.getMRI();
3890   const LLT S32 = LLT::scalar(32);
3891   const LLT S16 = LLT::scalar(16);
3892   const LLT V2S16 = LLT::vector(2, 16);
3893 
3894   // Index of first address argument
3895   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3896 
3897   int NumVAddrs, NumGradients;
3898   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3899   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3900     getDMaskIdx(BaseOpcode, NumDefs);
3901   unsigned DMask = 0;
3902 
3903   // Check for 16 bit addresses and pack if true.
3904   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3905   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3906   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3907   const bool IsG16 = GradTy == S16;
3908   const bool IsA16 = AddrTy == S16;
3909 
3910   int DMaskLanes = 0;
3911   if (!BaseOpcode->Atomic) {
3912     DMask = MI.getOperand(DMaskIdx).getImm();
3913     if (BaseOpcode->Gather4) {
3914       DMaskLanes = 4;
3915     } else if (DMask != 0) {
3916       DMaskLanes = countPopulation(DMask);
3917     } else if (!IsTFE && !BaseOpcode->Store) {
3918       // If dmask is 0, this is a no-op load. This can be eliminated.
3919       B.buildUndef(MI.getOperand(0));
3920       MI.eraseFromParent();
3921       return true;
3922     }
3923   }
3924 
3925   Observer.changingInstr(MI);
3926   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3927 
3928   unsigned NewOpcode = NumDefs == 0 ?
3929     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3930 
3931   // Track that we legalized this
3932   MI.setDesc(B.getTII().get(NewOpcode));
3933 
3934   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
3935   // dmask to be at least 1 otherwise the instruction will fail
3936   if (IsTFE && DMask == 0) {
3937     DMask = 0x1;
3938     DMaskLanes = 1;
3939     MI.getOperand(DMaskIdx).setImm(DMask);
3940   }
3941 
3942   if (BaseOpcode->Atomic) {
3943     Register VData0 = MI.getOperand(2).getReg();
3944     LLT Ty = MRI->getType(VData0);
3945 
3946     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3947     if (Ty.isVector())
3948       return false;
3949 
3950     if (BaseOpcode->AtomicX2) {
3951       Register VData1 = MI.getOperand(3).getReg();
3952       // The two values are packed in one register.
3953       LLT PackedTy = LLT::vector(2, Ty);
3954       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3955       MI.getOperand(2).setReg(Concat.getReg(0));
3956       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3957     }
3958   }
3959 
3960   int CorrectedNumVAddrs = NumVAddrs;
3961 
3962   // Optimize _L to _LZ when _L is zero
3963   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3964         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3965     const ConstantFP *ConstantLod;
3966     const int LodIdx = AddrIdx + NumVAddrs - 1;
3967 
3968     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3969       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3970         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3971         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3972           LZMappingInfo->LZ, ImageDimIntr->Dim);
3973 
3974         // The starting indexes should remain in the same place.
3975         --NumVAddrs;
3976         --CorrectedNumVAddrs;
3977 
3978         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3979           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3980         MI.RemoveOperand(LodIdx);
3981       }
3982     }
3983   }
3984 
3985   // Optimize _mip away, when 'lod' is zero
3986   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3987     int64_t ConstantLod;
3988     const int LodIdx = AddrIdx + NumVAddrs - 1;
3989 
3990     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3991       if (ConstantLod == 0) {
3992         // TODO: Change intrinsic opcode and remove operand instead or replacing
3993         // it with 0, as the _L to _LZ handling is done above.
3994         MI.getOperand(LodIdx).ChangeToImmediate(0);
3995         --CorrectedNumVAddrs;
3996       }
3997     }
3998   }
3999 
4000   // Rewrite the addressing register layout before doing anything else.
4001   if (IsA16 || IsG16) {
4002     if (IsA16) {
4003       // Target must support the feature and gradients need to be 16 bit too
4004       if (!ST.hasA16() || !IsG16)
4005         return false;
4006     } else if (!ST.hasG16())
4007       return false;
4008 
4009     if (NumVAddrs > 1) {
4010       SmallVector<Register, 4> PackedRegs;
4011       // Don't compress addresses for G16
4012       const int PackEndIdx =
4013           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
4014       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
4015                                   PackEndIdx, NumGradients);
4016 
4017       if (!IsA16) {
4018         // Add uncompressed address
4019         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
4020           int AddrReg = MI.getOperand(I).getReg();
4021           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4022           PackedRegs.push_back(AddrReg);
4023         }
4024       }
4025 
4026       // See also below in the non-a16 branch
4027       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4028 
4029       if (!UseNSA && PackedRegs.size() > 1) {
4030         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4031         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4032         PackedRegs[0] = Concat.getReg(0);
4033         PackedRegs.resize(1);
4034       }
4035 
4036       const int NumPacked = PackedRegs.size();
4037       for (int I = 0; I != NumVAddrs; ++I) {
4038         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
4039         if (!SrcOp.isReg()) {
4040           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4041           continue;
4042         }
4043 
4044         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4045 
4046         if (I < NumPacked)
4047           SrcOp.setReg(PackedRegs[I]);
4048         else
4049           SrcOp.setReg(AMDGPU::NoRegister);
4050       }
4051     }
4052   } else {
4053     // If the register allocator cannot place the address registers contiguously
4054     // without introducing moves, then using the non-sequential address encoding
4055     // is always preferable, since it saves VALU instructions and is usually a
4056     // wash in terms of code size or even better.
4057     //
4058     // However, we currently have no way of hinting to the register allocator
4059     // that MIMG addresses should be placed contiguously when it is possible to
4060     // do so, so force non-NSA for the common 2-address case as a heuristic.
4061     //
4062     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4063     // allocation when possible.
4064     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4065 
4066     if (!UseNSA && NumVAddrs > 1)
4067       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
4068   }
4069 
4070   int Flags = 0;
4071   if (IsA16)
4072     Flags |= 1;
4073   if (IsG16)
4074     Flags |= 2;
4075   MI.addOperand(MachineOperand::CreateImm(Flags));
4076 
4077   if (BaseOpcode->Store) { // No TFE for stores?
4078     // TODO: Handle dmask trim
4079     Register VData = MI.getOperand(1).getReg();
4080     LLT Ty = MRI->getType(VData);
4081     if (!Ty.isVector() || Ty.getElementType() != S16)
4082       return true;
4083 
4084     Register RepackedReg = handleD16VData(B, *MRI, VData);
4085     if (RepackedReg != VData) {
4086       MI.getOperand(1).setReg(RepackedReg);
4087     }
4088 
4089     return true;
4090   }
4091 
4092   Register DstReg = MI.getOperand(0).getReg();
4093   LLT Ty = MRI->getType(DstReg);
4094   const LLT EltTy = Ty.getScalarType();
4095   const bool IsD16 = Ty.getScalarType() == S16;
4096   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4097 
4098   // Confirm that the return type is large enough for the dmask specified
4099   if (NumElts < DMaskLanes)
4100     return false;
4101 
4102   if (NumElts > 4 || DMaskLanes > 4)
4103     return false;
4104 
4105   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4106   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4107 
4108   // The raw dword aligned data component of the load. The only legal cases
4109   // where this matters should be when using the packed D16 format, for
4110   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4111   LLT RoundedTy;
4112 
  // S32 vector to cover all data, plus TFE result element.
4114   LLT TFETy;
4115 
4116   // Register type to use for each loaded component. Will be S32 or V2S16.
4117   LLT RegTy;
4118 
4119   if (IsD16 && ST.hasUnpackedD16VMem()) {
4120     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4121     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4122     RegTy = S32;
4123   } else {
4124     unsigned EltSize = EltTy.getSizeInBits();
4125     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4126     unsigned RoundedSize = 32 * RoundedElts;
4127     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4128     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4129     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4130   }
4131 
4132   // The return type does not need adjustment.
4133   // TODO: Should we change s16 case to s32 or <2 x s16>?
4134   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4135     return true;
4136 
4137   Register Dst1Reg;
4138 
4139   // Insert after the instruction.
4140   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4141 
4142   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4143   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4144   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4145   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4146 
4147   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4148 
4149   MI.getOperand(0).setReg(NewResultReg);
4150 
  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4155 
4156   if (IsTFE) {
4157     Dst1Reg = MI.getOperand(1).getReg();
4158     if (MRI->getType(Dst1Reg) != S32)
4159       return false;
4160 
4161     // TODO: Make sure the TFE operand bit is set.
4162     MI.RemoveOperand(1);
4163 
4164     // Handle the easy case that requires no repack instructions.
4165     if (Ty == S32) {
4166       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4167       return true;
4168     }
4169   }
4170 
4171   // Now figure out how to copy the new result register back into the old
4172   // result.
4173   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4174 
4175   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4176 
4177   if (ResultNumRegs == 1) {
4178     assert(!IsTFE);
4179     ResultRegs[0] = NewResultReg;
4180   } else {
4181     // We have to repack into a new vector of some kind.
4182     for (int I = 0; I != NumDataRegs; ++I)
4183       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4184     B.buildUnmerge(ResultRegs, NewResultReg);
4185 
4186     // Drop the final TFE element to get the data part. The TFE result is
4187     // directly written to the right place already.
4188     if (IsTFE)
4189       ResultRegs.resize(NumDataRegs);
4190   }
4191 
4192   // For an s16 scalar result, we form an s32 result with a truncate regardless
4193   // of packed vs. unpacked.
4194   if (IsD16 && !Ty.isVector()) {
4195     B.buildTrunc(DstReg, ResultRegs[0]);
4196     return true;
4197   }
4198 
4199   // Avoid a build/concat_vector of 1 entry.
4200   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4201     B.buildBitcast(DstReg, ResultRegs[0]);
4202     return true;
4203   }
4204 
4205   assert(Ty.isVector());
4206 
4207   if (IsD16) {
4208     // For packed D16 results with TFE enabled, all the data components are
4209     // S32. Cast back to the expected type.
4210     //
4211     // TODO: We don't really need to use load s32 elements. We would only need one
4212     // cast for the TFE result if a multiple of v2s16 was used.
4213     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4214       for (Register &Reg : ResultRegs)
4215         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4216     } else if (ST.hasUnpackedD16VMem()) {
4217       for (Register &Reg : ResultRegs)
4218         Reg = B.buildTrunc(S16, Reg).getReg(0);
4219     }
4220   }
4221 
4222   auto padWithUndef = [&](LLT Ty, int NumElts) {
4223     if (NumElts == 0)
4224       return;
4225     Register Undef = B.buildUndef(Ty).getReg(0);
4226     for (int I = 0; I != NumElts; ++I)
4227       ResultRegs.push_back(Undef);
4228   };
4229 
4230   // Pad out any elements eliminated due to the dmask.
4231   LLT ResTy = MRI->getType(ResultRegs[0]);
4232   if (!ResTy.isVector()) {
4233     padWithUndef(ResTy, NumElts - ResultRegs.size());
4234     B.buildBuildVector(DstReg, ResultRegs);
4235     return true;
4236   }
4237 
4238   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4239   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4240 
4241   // Deal with the one annoying legal case.
4242   const LLT V3S16 = LLT::vector(3, 16);
4243   if (Ty == V3S16) {
4244     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4245     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4246     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4247     return true;
4248   }
4249 
4250   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4251   B.buildConcatVectors(DstReg, ResultRegs);
4252   return true;
4253 }
4254 
4255 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4256   LegalizerHelper &Helper, MachineInstr &MI) const {
4257   MachineIRBuilder &B = Helper.MIRBuilder;
4258   GISelChangeObserver &Observer = Helper.Observer;
4259 
4260   Register Dst = MI.getOperand(0).getReg();
4261   LLT Ty = B.getMRI()->getType(Dst);
4262   unsigned Size = Ty.getSizeInBits();
4263   MachineFunction &MF = B.getMF();
4264 
4265   Observer.changingInstr(MI);
4266 
4267   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4268     Ty = getBitcastRegisterType(Ty);
4269     Helper.bitcastDst(MI, Ty, 0);
4270     Dst = MI.getOperand(0).getReg();
4271     B.setInsertPt(B.getMBB(), MI);
4272   }
4273 
4274   // FIXME: We don't really need this intermediate instruction. The intrinsic
4275   // should be fixed to have a memory operand. Since it's readnone, we're not
4276   // allowed to add one.
4277   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4278   MI.RemoveOperand(1); // Remove intrinsic ID
4279 
4280   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4281   // TODO: Should this use datalayout alignment?
4282   const unsigned MemSize = (Size + 7) / 8;
4283   const Align MemAlign(4);
4284   MachineMemOperand *MMO = MF.getMachineMemOperand(
4285       MachinePointerInfo(),
4286       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4287           MachineMemOperand::MOInvariant,
4288       MemSize, MemAlign);
4289   MI.addMemOperand(MF, MMO);
4290 
4291   // There are no 96-bit result scalar loads, but widening to 128-bit should
4292   // always be legal. We may need to restore this to a 96-bit result if it turns
4293   // out this needs to be converted to a vector load during RegBankSelect.
4294   if (!isPowerOf2_32(Size)) {
4295     if (Ty.isVector())
4296       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4297     else
4298       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4299   }
4300 
4301   Observer.changedInstr(MI);
4302   return true;
4303 }
4304 
4305 // TODO: Move to selection
4306 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4307                                                 MachineRegisterInfo &MRI,
4308                                                 MachineIRBuilder &B) const {
4309   // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
4310   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4311       !ST.isTrapHandlerEnabled()) {
4312     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4313   } else {
4314     // Pass queue pointer to trap handler as input, and insert trap instruction
4315     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4316     MachineRegisterInfo &MRI = *B.getMRI();
4317 
4318     Register LiveIn =
4319       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4320     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4321       return false;
4322 
4323     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4324     B.buildCopy(SGPR01, LiveIn);
4325     B.buildInstr(AMDGPU::S_TRAP)
4326         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4327         .addReg(SGPR01, RegState::Implicit);
4328   }
4329 
4330   MI.eraseFromParent();
4331   return true;
4332 }
4333 
4334 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4335     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4336   // Is non-HSA path or trap-handler disabled? then, report a warning
4337   // accordingly
4338   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4339       !ST.isTrapHandlerEnabled()) {
4340     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4341                                      "debugtrap handler not supported",
4342                                      MI.getDebugLoc(), DS_Warning);
4343     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4344     Ctx.diagnose(NoTrap);
4345   } else {
4346     // Insert debug-trap instruction
4347     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4348   }
4349 
4350   MI.eraseFromParent();
4351   return true;
4352 }
4353 
4354 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4355                                             MachineInstr &MI) const {
4356   MachineIRBuilder &B = Helper.MIRBuilder;
4357   MachineRegisterInfo &MRI = *B.getMRI();
4358 
4359   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4360   auto IntrID = MI.getIntrinsicID();
4361   switch (IntrID) {
4362   case Intrinsic::amdgcn_if:
4363   case Intrinsic::amdgcn_else: {
4364     MachineInstr *Br = nullptr;
4365     MachineBasicBlock *UncondBrTarget = nullptr;
4366     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4367       const SIRegisterInfo *TRI
4368         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4369 
4370       Register Def = MI.getOperand(1).getReg();
4371       Register Use = MI.getOperand(3).getReg();
4372 
4373       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4374       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4375       if (IntrID == Intrinsic::amdgcn_if) {
4376         B.buildInstr(AMDGPU::SI_IF)
4377           .addDef(Def)
4378           .addUse(Use)
4379           .addMBB(UncondBrTarget);
4380       } else {
4381         B.buildInstr(AMDGPU::SI_ELSE)
4382           .addDef(Def)
4383           .addUse(Use)
4384           .addMBB(UncondBrTarget)
4385           .addImm(0);
4386       }
4387 
4388       if (Br) {
4389         Br->getOperand(0).setMBB(CondBrTarget);
4390       } else {
4391         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4392         // since we're swapping branch targets it needs to be reinserted.
4393         // FIXME: IRTranslator should probably not do this
4394         B.buildBr(*CondBrTarget);
4395       }
4396 
4397       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4398       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4399       MI.eraseFromParent();
4400       BrCond->eraseFromParent();
4401       return true;
4402     }
4403 
4404     return false;
4405   }
4406   case Intrinsic::amdgcn_loop: {
4407     MachineInstr *Br = nullptr;
4408     MachineBasicBlock *UncondBrTarget = nullptr;
4409     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4410       const SIRegisterInfo *TRI
4411         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4412 
4413       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4414       Register Reg = MI.getOperand(2).getReg();
4415 
4416       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4417       B.buildInstr(AMDGPU::SI_LOOP)
4418         .addUse(Reg)
4419         .addMBB(UncondBrTarget);
4420 
4421       if (Br)
4422         Br->getOperand(0).setMBB(CondBrTarget);
4423       else
4424         B.buildBr(*CondBrTarget);
4425 
4426       MI.eraseFromParent();
4427       BrCond->eraseFromParent();
4428       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4429       return true;
4430     }
4431 
4432     return false;
4433   }
4434   case Intrinsic::amdgcn_kernarg_segment_ptr:
4435     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4436       // This only makes sense to call in a kernel, so just lower to null.
4437       B.buildConstant(MI.getOperand(0).getReg(), 0);
4438       MI.eraseFromParent();
4439       return true;
4440     }
4441 
4442     return legalizePreloadedArgIntrin(
4443       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4444   case Intrinsic::amdgcn_implicitarg_ptr:
4445     return legalizeImplicitArgPtr(MI, MRI, B);
4446   case Intrinsic::amdgcn_workitem_id_x:
4447     return legalizePreloadedArgIntrin(MI, MRI, B,
4448                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4449   case Intrinsic::amdgcn_workitem_id_y:
4450     return legalizePreloadedArgIntrin(MI, MRI, B,
4451                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4452   case Intrinsic::amdgcn_workitem_id_z:
4453     return legalizePreloadedArgIntrin(MI, MRI, B,
4454                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4455   case Intrinsic::amdgcn_workgroup_id_x:
4456     return legalizePreloadedArgIntrin(MI, MRI, B,
4457                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4458   case Intrinsic::amdgcn_workgroup_id_y:
4459     return legalizePreloadedArgIntrin(MI, MRI, B,
4460                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4461   case Intrinsic::amdgcn_workgroup_id_z:
4462     return legalizePreloadedArgIntrin(MI, MRI, B,
4463                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4464   case Intrinsic::amdgcn_dispatch_ptr:
4465     return legalizePreloadedArgIntrin(MI, MRI, B,
4466                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4467   case Intrinsic::amdgcn_queue_ptr:
4468     return legalizePreloadedArgIntrin(MI, MRI, B,
4469                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4470   case Intrinsic::amdgcn_implicit_buffer_ptr:
4471     return legalizePreloadedArgIntrin(
4472       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4473   case Intrinsic::amdgcn_dispatch_id:
4474     return legalizePreloadedArgIntrin(MI, MRI, B,
4475                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4476   case Intrinsic::amdgcn_fdiv_fast:
4477     return legalizeFDIVFastIntrin(MI, MRI, B);
4478   case Intrinsic::amdgcn_is_shared:
4479     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4480   case Intrinsic::amdgcn_is_private:
4481     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4482   case Intrinsic::amdgcn_wavefrontsize: {
4483     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4484     MI.eraseFromParent();
4485     return true;
4486   }
4487   case Intrinsic::amdgcn_s_buffer_load:
4488     return legalizeSBufferLoad(Helper, MI);
4489   case Intrinsic::amdgcn_raw_buffer_store:
4490   case Intrinsic::amdgcn_struct_buffer_store:
4491     return legalizeBufferStore(MI, MRI, B, false, false);
4492   case Intrinsic::amdgcn_raw_buffer_store_format:
4493   case Intrinsic::amdgcn_struct_buffer_store_format:
4494     return legalizeBufferStore(MI, MRI, B, false, true);
4495   case Intrinsic::amdgcn_raw_tbuffer_store:
4496   case Intrinsic::amdgcn_struct_tbuffer_store:
4497     return legalizeBufferStore(MI, MRI, B, true, true);
4498   case Intrinsic::amdgcn_raw_buffer_load:
4499   case Intrinsic::amdgcn_struct_buffer_load:
4500     return legalizeBufferLoad(MI, MRI, B, false, false);
4501   case Intrinsic::amdgcn_raw_buffer_load_format:
4502   case Intrinsic::amdgcn_struct_buffer_load_format:
4503     return legalizeBufferLoad(MI, MRI, B, true, false);
4504   case Intrinsic::amdgcn_raw_tbuffer_load:
4505   case Intrinsic::amdgcn_struct_tbuffer_load:
4506     return legalizeBufferLoad(MI, MRI, B, true, true);
4507   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4508   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4509   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4510   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4511   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4512   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4513   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4514   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4515   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4516   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4517   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4518   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4519   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4520   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4521   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4522   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4523   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4524   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4525   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4526   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4527   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4528   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4529   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4530   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4531   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4532   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4533   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4534   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4535     return legalizeBufferAtomic(MI, B, IntrID);
4536   case Intrinsic::amdgcn_atomic_inc:
4537     return legalizeAtomicIncDec(MI, B, true);
4538   case Intrinsic::amdgcn_atomic_dec:
4539     return legalizeAtomicIncDec(MI, B, false);
4540   case Intrinsic::trap:
4541     return legalizeTrapIntrinsic(MI, MRI, B);
4542   case Intrinsic::debugtrap:
4543     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4544   case Intrinsic::amdgcn_rsq_clamp:
4545     return legalizeRsqClampIntrinsic(MI, MRI, B);
4546   case Intrinsic::amdgcn_ds_fadd:
4547   case Intrinsic::amdgcn_ds_fmin:
4548   case Intrinsic::amdgcn_ds_fmax:
4549     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4550   default: {
4551     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4552             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4553       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4554     return true;
4555   }
4556   }
4557 
4558   return true;
4559 }
4560