1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
55 
// Round the scalar size up to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
62 
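// Small vectors with an odd number of sub-32-bit elements that don't fill a
// whole number of 32-bit registers, e.g. <3 x s16>.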
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     return Ty.getSizeInBits() % 32 == 0;
77   };
78 }
79 
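// Vectors of 16-bit elements wider than the packed v2s16 register type.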
80 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     const LLT EltTy = Ty.getScalarType();
84     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
85   };
86 }
87 
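// Mutation that adds one more element to the vector type at TypeIdx.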
88 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getElementType();
92     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
93   };
94 }
95 
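// Mutation that shrinks the element count so the resulting type is at most 64
// bits, degrading to a scalar if only a single element remains.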
96 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99     const LLT EltTy = Ty.getElementType();
100     unsigned Size = Ty.getSizeInBits();
101     unsigned Pieces = (Size + 63) / 64;
102     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
103     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
104   };
105 }
106 
107 // Increase the number of vector elements to reach the next multiple of 32-bit
108 // type.
109 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
110   return [=](const LegalityQuery &Query) {
111     const LLT Ty = Query.Types[TypeIdx];
112 
113     const LLT EltTy = Ty.getElementType();
114     const int Size = Ty.getSizeInBits();
115     const int EltSize = EltTy.getSizeInBits();
116     const int NextMul32 = (Size + 31) / 32;
117 
118     assert(EltSize < 32);
119 
120     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
121     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
122   };
123 }
124 
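// Pick the register-like type to bitcast a load/store value to: a single
// scalar for sizes up to 32 bits, otherwise one or more 32-bit elements.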
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}
137 
138 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
139   return [=](const LegalityQuery &Query) {
140     const LLT Ty = Query.Types[TypeIdx];
141     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
142   };
143 }
144 
145 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
146   return [=](const LegalityQuery &Query) {
147     const LLT Ty = Query.Types[TypeIdx];
148     unsigned Size = Ty.getSizeInBits();
149     assert(Size % 32 == 0);
150     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
151   };
152 }
153 
154 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
155   return [=](const LegalityQuery &Query) {
156     const LLT QueryTy = Query.Types[TypeIdx];
157     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
158   };
159 }
160 
161 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
162   return [=](const LegalityQuery &Query) {
163     const LLT QueryTy = Query.Types[TypeIdx];
164     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
165   };
166 }
167 
168 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
169   return [=](const LegalityQuery &Query) {
170     const LLT QueryTy = Query.Types[TypeIdx];
171     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
172   };
173 }
174 
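// A legal register size: a multiple of 32 bits, up to the 1024-bit maximum.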
175 static bool isRegisterSize(unsigned Size) {
176   return Size % 32 == 0 && Size <= MaxRegisterSize;
177 }
178 
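// Element types that pack cleanly into registers: 16-bit, or any multiple of
// 32 bits.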
179 static bool isRegisterVectorElementType(LLT EltTy) {
180   const int EltSize = EltTy.getSizeInBits();
181   return EltSize == 16 || EltSize % 32 == 0;
182 }
183 
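// Vector shapes that map directly onto register tuples: 32, 64, 128 or
// 256-bit elements, or an even number of 16-bit elements.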
184 static bool isRegisterVectorType(LLT Ty) {
185   const int EltSize = Ty.getElementType().getSizeInBits();
186   return EltSize == 32 || EltSize == 64 ||
187          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
188          EltSize == 128 || EltSize == 256;
189 }
190 
191 static bool isRegisterType(LLT Ty) {
192   if (!isRegisterSize(Ty.getSizeInBits()))
193     return false;
194 
195   if (Ty.isVector())
196     return isRegisterVectorType(Ty);
197 
198   return true;
199 }
200 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
203 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
204   return [=](const LegalityQuery &Query) {
205     return isRegisterType(Query.Types[TypeIdx]);
206   };
207 }
208 
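// Vectors whose element type can be operated on directly: s16, or anything at
// least 32 bits wide.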
209 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
210   return [=](const LegalityQuery &Query) {
211     const LLT QueryTy = Query.Types[TypeIdx];
212     if (!QueryTy.isVector())
213       return false;
214     const LLT EltTy = QueryTy.getElementType();
215     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
216   };
217 }
218 
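// A scalar truncating store: the register type is wider than 32 bits and
// wider than the stored memory size.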
219 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT Ty = Query.Types[TypeIdx];
222     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
223            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
224   };
225 }
226 
227 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
228 // handle some operations by just promoting the register during
229 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
230 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
231                                     bool IsLoad) {
232   switch (AS) {
233   case AMDGPUAS::PRIVATE_ADDRESS:
234     // FIXME: Private element size.
235     return 32;
236   case AMDGPUAS::LOCAL_ADDRESS:
237     return ST.useDS128() ? 128 : 64;
238   case AMDGPUAS::GLOBAL_ADDRESS:
239   case AMDGPUAS::CONSTANT_ADDRESS:
240   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
247     return IsLoad ? 512 : 128;
248   default:
249     // Flat addresses may contextually need to be split to 32-bit parts if they
250     // may alias scratch depending on the subtarget.
251     return 128;
252   }
253 }
254 
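// Return true if the combination of register type, memory size, address space
// and alignment is legal for a plain (unsplit) load or store.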
255 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
256                                  const LegalityQuery &Query,
257                                  unsigned Opcode) {
258   const LLT Ty = Query.Types[0];
259 
260   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
261   const bool IsLoad = Opcode != AMDGPU::G_STORE;
262 
263   unsigned RegSize = Ty.getSizeInBits();
264   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
265   unsigned Align = Query.MMODescrs[0].AlignInBits;
266   unsigned AS = Query.Types[1].getAddressSpace();
267 
268   // All of these need to be custom lowered to cast the pointer operand.
269   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
270     return false;
271 
  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, Align);
#endif
279 
280   // Only 1-byte and 2-byte to 32-bit extloads are valid.
281   if (MemSize != RegSize && RegSize != 32)
282     return false;
283 
284   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
285     return false;
286 
287   switch (MemSize) {
288   case 8:
289   case 16:
290   case 32:
291   case 64:
292   case 128:
293     break;
294   case 96:
295     if (!ST.hasDwordx3LoadStores())
296       return false;
297     break;
298   case 256:
299   case 512:
300     // These may contextually need to be broken down.
301     break;
302   default:
303     return false;
304   }
305 
306   assert(RegSize >= MemSize);
307 
308   if (Align < MemSize) {
309     const SITargetLowering *TLI = ST.getTargetLowering();
310     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
311       return false;
312   }
313 
314   return true;
315 }
316 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
321 static bool loadStoreBitcastWorkaround(const LLT Ty) {
322   if (EnableNewLegality)
323     return false;
324 
325   const unsigned Size = Ty.getSizeInBits();
326   if (Size <= 64)
327     return false;
328   if (!Ty.isVector())
329     return true;
330   unsigned EltSize = Ty.getElementType().getSizeInBits();
331   return EltSize != 32 && EltSize != 64;
332 }
333 
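// Combined legality check for loads and stores: the result must be a
// register-like type, the size must be legal for the address space, and it
// must not be a case we currently work around with a bitcast.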
334 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
335                              unsigned Opcode) {
336   const LLT Ty = Query.Types[0];
337   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
338          !loadStoreBitcastWorkaround(Ty);
339 }
340 
341 /// Return true if a load or store of the type should be lowered with a bitcast
342 /// to a different type.
343 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
344                                        const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
348 
349   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
350     return true;
351   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
352          !isRegisterVectorElementType(Ty.getElementType());
353 }
354 
355 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
356                                          const GCNTargetMachine &TM)
357   :  ST(ST_) {
358   using namespace TargetOpcode;
359 
360   auto GetAddrSpacePtr = [&TM](unsigned AS) {
361     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
362   };
363 
364   const LLT S1 = LLT::scalar(1);
365   const LLT S16 = LLT::scalar(16);
366   const LLT S32 = LLT::scalar(32);
367   const LLT S64 = LLT::scalar(64);
368   const LLT S128 = LLT::scalar(128);
369   const LLT S256 = LLT::scalar(256);
370   const LLT S512 = LLT::scalar(512);
371   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
372 
373   const LLT V2S16 = LLT::vector(2, 16);
374   const LLT V4S16 = LLT::vector(4, 16);
375 
376   const LLT V2S32 = LLT::vector(2, 32);
377   const LLT V3S32 = LLT::vector(3, 32);
378   const LLT V4S32 = LLT::vector(4, 32);
379   const LLT V5S32 = LLT::vector(5, 32);
380   const LLT V6S32 = LLT::vector(6, 32);
381   const LLT V7S32 = LLT::vector(7, 32);
382   const LLT V8S32 = LLT::vector(8, 32);
383   const LLT V9S32 = LLT::vector(9, 32);
384   const LLT V10S32 = LLT::vector(10, 32);
385   const LLT V11S32 = LLT::vector(11, 32);
386   const LLT V12S32 = LLT::vector(12, 32);
387   const LLT V13S32 = LLT::vector(13, 32);
388   const LLT V14S32 = LLT::vector(14, 32);
389   const LLT V15S32 = LLT::vector(15, 32);
390   const LLT V16S32 = LLT::vector(16, 32);
391   const LLT V32S32 = LLT::vector(32, 32);
392 
393   const LLT V2S64 = LLT::vector(2, 64);
394   const LLT V3S64 = LLT::vector(3, 64);
395   const LLT V4S64 = LLT::vector(4, 64);
396   const LLT V5S64 = LLT::vector(5, 64);
397   const LLT V6S64 = LLT::vector(6, 64);
398   const LLT V7S64 = LLT::vector(7, 64);
399   const LLT V8S64 = LLT::vector(8, 64);
400   const LLT V16S64 = LLT::vector(16, 64);
401 
402   std::initializer_list<LLT> AllS32Vectors =
403     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
404      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
405   std::initializer_list<LLT> AllS64Vectors =
406     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
407 
408   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
409   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
410   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
411   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
412   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
413   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
414   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
415 
416   const LLT CodePtr = FlatPtr;
417 
418   const std::initializer_list<LLT> AddrSpaces64 = {
419     GlobalPtr, ConstantPtr, FlatPtr
420   };
421 
422   const std::initializer_list<LLT> AddrSpaces32 = {
423     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
424   };
425 
426   const std::initializer_list<LLT> FPTypesBase = {
427     S32, S64
428   };
429 
430   const std::initializer_list<LLT> FPTypes16 = {
431     S32, S64, S16
432   };
433 
434   const std::initializer_list<LLT> FPTypesPK16 = {
435     S32, S64, S16, V2S16
436   };
437 
438   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
439 
440   setAction({G_BRCOND, S1}, Legal); // VCC branches
441   setAction({G_BRCOND, S32}, Legal); // SCC branches
442 
443   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
444   // elements for v3s16
445   getActionDefinitionsBuilder(G_PHI)
446     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
447     .legalFor(AllS32Vectors)
448     .legalFor(AllS64Vectors)
449     .legalFor(AddrSpaces64)
450     .legalFor(AddrSpaces32)
451     .legalIf(isPointer(0))
452     .clampScalar(0, S32, S256)
453     .widenScalarToNextPow2(0, 32)
454     .clampMaxNumElements(0, S32, 16)
455     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
456     .scalarize(0);
457 
458   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
459     // Full set of gfx9 features.
460     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
461       .legalFor({S32, S16, V2S16})
462       .clampScalar(0, S16, S32)
463       .clampMaxNumElements(0, S16, 2)
464       .scalarize(0)
465       .widenScalarToNextPow2(0, 32);
466 
467     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
468       .legalFor({S32, S16, V2S16}) // Clamp modifier
469       .minScalar(0, S16)
470       .clampMaxNumElements(0, S16, 2)
471       .scalarize(0)
472       .widenScalarToNextPow2(0, 32)
473       .lower();
474   } else if (ST.has16BitInsts()) {
475     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
476       .legalFor({S32, S16})
477       .clampScalar(0, S16, S32)
478       .scalarize(0)
479       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
480 
481     // Technically the saturating operations require clamp bit support, but this
482     // was introduced at the same time as 16-bit operations.
483     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
484       .legalFor({S32, S16}) // Clamp modifier
485       .minScalar(0, S16)
486       .scalarize(0)
487       .widenScalarToNextPow2(0, 16)
488       .lower();
489 
490     // We're just lowering this, but it helps get a better result to try to
491     // coerce to the desired type first.
492     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
493       .minScalar(0, S16)
494       .scalarize(0)
495       .lower();
496   } else {
497     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
498       .legalFor({S32})
499       .clampScalar(0, S32, S32)
500       .scalarize(0);
501 
502     if (ST.hasIntClamp()) {
503       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
504         .legalFor({S32}) // Clamp modifier.
505         .scalarize(0)
506         .minScalarOrElt(0, S32)
507         .lower();
508     } else {
509       // Clamp bit support was added in VI, along with 16-bit operations.
510       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
511         .minScalar(0, S32)
512         .scalarize(0)
513         .lower();
514     }
515 
516     // FIXME: DAG expansion gets better results. The widening uses the smaller
517     // range values and goes for the min/max lowering directly.
518     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
519       .minScalar(0, S32)
520       .scalarize(0)
521       .lower();
522   }
523 
524   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
525     .customFor({S32, S64})
526     .clampScalar(0, S32, S64)
527     .widenScalarToNextPow2(0, 32)
528     .scalarize(0);
529 
530   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
531     .legalFor({S32})
532     .clampScalar(0, S32, S32)
533     .scalarize(0);
534 
535   // Report legal for any types we can handle anywhere. For the cases only legal
536   // on the SALU, RegBankSelect will be able to re-legalize.
537   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
538     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
539     .clampScalar(0, S32, S64)
540     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
541     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
542     .widenScalarToNextPow2(0)
543     .scalarize(0);
544 
545   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
546                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
547     .legalFor({{S32, S1}, {S32, S32}})
548     .minScalar(0, S32)
549     // TODO: .scalarize(0)
550     .lower();
551 
552   getActionDefinitionsBuilder(G_BITCAST)
553     // Don't worry about the size constraint.
554     .legalIf(all(isRegisterType(0), isRegisterType(1)))
555     .lower();
556 
  getActionDefinitionsBuilder(G_CONSTANT)
559     .legalFor({S1, S32, S64, S16, GlobalPtr,
560                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
561     .legalIf(isPointer(0))
562     .clampScalar(0, S32, S64)
563     .widenScalarToNextPow2(0);
564 
565   getActionDefinitionsBuilder(G_FCONSTANT)
566     .legalFor({S32, S64, S16})
567     .clampScalar(0, S16, S64);
568 
569   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
570       .legalIf(isRegisterType(0))
571       // s1 and s16 are special cases because they have legal operations on
572       // them, but don't really occupy registers in the normal way.
573       .legalFor({S1, S16})
574       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
575       .clampScalarOrElt(0, S32, MaxScalar)
576       .widenScalarToNextPow2(0, 32)
577       .clampMaxNumElements(0, S32, 16);
578 
579   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
580 
581   // If the amount is divergent, we have to do a wave reduction to get the
582   // maximum value, so this is expanded during RegBankSelect.
583   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
584     .legalFor({{PrivatePtr, S32}});
585 
586   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
587     .customIf(typeIsNot(0, PrivatePtr));
588 
589   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
590 
591   auto &FPOpActions = getActionDefinitionsBuilder(
592     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
593     .legalFor({S32, S64});
594   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
595     .customFor({S32, S64});
596   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
597     .customFor({S32, S64});
598 
599   if (ST.has16BitInsts()) {
600     if (ST.hasVOP3PInsts())
601       FPOpActions.legalFor({S16, V2S16});
602     else
603       FPOpActions.legalFor({S16});
604 
605     TrigActions.customFor({S16});
606     FDIVActions.customFor({S16});
607   }
608 
609   auto &MinNumMaxNum = getActionDefinitionsBuilder({
610       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
611 
612   if (ST.hasVOP3PInsts()) {
613     MinNumMaxNum.customFor(FPTypesPK16)
614       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
615       .clampMaxNumElements(0, S16, 2)
616       .clampScalar(0, S16, S64)
617       .scalarize(0);
618   } else if (ST.has16BitInsts()) {
619     MinNumMaxNum.customFor(FPTypes16)
620       .clampScalar(0, S16, S64)
621       .scalarize(0);
622   } else {
623     MinNumMaxNum.customFor(FPTypesBase)
624       .clampScalar(0, S32, S64)
625       .scalarize(0);
626   }
627 
628   if (ST.hasVOP3PInsts())
629     FPOpActions.clampMaxNumElements(0, S16, 2);
630 
631   FPOpActions
632     .scalarize(0)
633     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
634 
635   TrigActions
636     .scalarize(0)
637     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
638 
639   FDIVActions
640     .scalarize(0)
641     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
642 
643   getActionDefinitionsBuilder({G_FNEG, G_FABS})
644     .legalFor(FPTypesPK16)
645     .clampMaxNumElements(0, S16, 2)
646     .scalarize(0)
647     .clampScalar(0, S16, S64);
648 
649   if (ST.has16BitInsts()) {
650     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
651       .legalFor({S32, S64, S16})
652       .scalarize(0)
653       .clampScalar(0, S16, S64);
654   } else {
655     getActionDefinitionsBuilder(G_FSQRT)
656       .legalFor({S32, S64})
657       .scalarize(0)
658       .clampScalar(0, S32, S64);
659 
660     if (ST.hasFractBug()) {
661       getActionDefinitionsBuilder(G_FFLOOR)
662         .customFor({S64})
663         .legalFor({S32, S64})
664         .scalarize(0)
665         .clampScalar(0, S32, S64);
666     } else {
667       getActionDefinitionsBuilder(G_FFLOOR)
668         .legalFor({S32, S64})
669         .scalarize(0)
670         .clampScalar(0, S32, S64);
671     }
672   }
673 
674   getActionDefinitionsBuilder(G_FPTRUNC)
675     .legalFor({{S32, S64}, {S16, S32}})
676     .scalarize(0)
677     .lower();
678 
679   getActionDefinitionsBuilder(G_FPEXT)
680     .legalFor({{S64, S32}, {S32, S16}})
681     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
682     .scalarize(0);
683 
684   getActionDefinitionsBuilder(G_FSUB)
685       // Use actual fsub instruction
686       .legalFor({S32})
687       // Must use fadd + fneg
688       .lowerFor({S64, S16, V2S16})
689       .scalarize(0)
690       .clampScalar(0, S32, S64);
691 
692   // Whether this is legal depends on the floating point mode for the function.
693   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
694   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
695     FMad.customFor({S32, S16});
696   else if (ST.hasMadMacF32Insts())
697     FMad.customFor({S32});
698   else if (ST.hasMadF16())
699     FMad.customFor({S16});
700   FMad.scalarize(0)
701       .lower();
702 
703   // TODO: Do we need to clamp maximum bitwidth?
704   getActionDefinitionsBuilder(G_TRUNC)
705     .legalIf(isScalar(0))
706     .legalFor({{V2S16, V2S32}})
707     .clampMaxNumElements(0, S16, 2)
708     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
709     // situations (like an invalid implicit use), we don't want to infinite loop
710     // in the legalizer.
711     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
712     .alwaysLegal();
713 
714   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
715     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
716                {S32, S1}, {S64, S1}, {S16, S1}})
717     .scalarize(0)
718     .clampScalar(0, S32, S64)
719     .widenScalarToNextPow2(1, 32);
720 
721   // TODO: Split s1->s64 during regbankselect for VALU.
722   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
723     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
724     .lowerFor({{S32, S64}})
725     .lowerIf(typeIs(1, S1))
726     .customFor({{S64, S64}});
727   if (ST.has16BitInsts())
728     IToFP.legalFor({{S16, S16}});
729   IToFP.clampScalar(1, S32, S64)
730        .minScalar(0, S32)
731        .scalarize(0)
732        .widenScalarToNextPow2(1);
733 
734   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
735     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
736     .customFor({{S64, S64}})
737     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
738   if (ST.has16BitInsts())
739     FPToI.legalFor({{S16, S16}});
740   else
741     FPToI.minScalar(1, S32);
742 
743   FPToI.minScalar(0, S32)
744        .scalarize(0)
745        .lower();
746 
747   // Lower roundeven into G_FRINT
748   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
749     .scalarize(0)
750     .lower();
751 
752   if (ST.has16BitInsts()) {
753     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
754       .legalFor({S16, S32, S64})
755       .clampScalar(0, S16, S64)
756       .scalarize(0);
757   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
758     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
759       .legalFor({S32, S64})
760       .clampScalar(0, S32, S64)
761       .scalarize(0);
762   } else {
763     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
764       .legalFor({S32})
765       .customFor({S64})
766       .clampScalar(0, S32, S64)
767       .scalarize(0);
768   }
769 
770   getActionDefinitionsBuilder(G_PTR_ADD)
771     .legalIf(all(isPointer(0), sameSize(0, 1)))
772     .scalarize(0)
773     .scalarSameSizeAs(1, 0);
774 
775   getActionDefinitionsBuilder(G_PTRMASK)
776     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
777     .scalarSameSizeAs(1, 0)
778     .scalarize(0);
779 
780   auto &CmpBuilder =
781     getActionDefinitionsBuilder(G_ICMP)
782     // The compare output type differs based on the register bank of the output,
783     // so make both s1 and s32 legal.
784     //
785     // Scalar compares producing output in scc will be promoted to s32, as that
786     // is the allocatable register type that will be needed for the copy from
787     // scc. This will be promoted during RegBankSelect, and we assume something
788     // before that won't try to use s32 result types.
789     //
790     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
791     // bank.
792     .legalForCartesianProduct(
793       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
794     .legalForCartesianProduct(
795       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
796   if (ST.has16BitInsts()) {
797     CmpBuilder.legalFor({{S1, S16}});
798   }
799 
800   CmpBuilder
801     .widenScalarToNextPow2(1)
802     .clampScalar(1, S32, S64)
803     .scalarize(0)
804     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
805 
806   getActionDefinitionsBuilder(G_FCMP)
807     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
808     .widenScalarToNextPow2(1)
809     .clampScalar(1, S32, S64)
810     .scalarize(0);
811 
812   // FIXME: fpow has a selection pattern that should move to custom lowering.
813   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
814   if (ST.has16BitInsts())
815     Exp2Ops.legalFor({S32, S16});
816   else
817     Exp2Ops.legalFor({S32});
818   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
819   Exp2Ops.scalarize(0);
820 
821   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
822   if (ST.has16BitInsts())
823     ExpOps.customFor({{S32}, {S16}});
824   else
825     ExpOps.customFor({S32});
826   ExpOps.clampScalar(0, MinScalarFPTy, S32)
827         .scalarize(0);
828 
829   getActionDefinitionsBuilder(G_FPOWI)
830     .clampScalar(0, MinScalarFPTy, S32)
831     .lower();
832 
833   // The 64-bit versions produce 32-bit results, but only on the SALU.
834   getActionDefinitionsBuilder(G_CTPOP)
835     .legalFor({{S32, S32}, {S32, S64}})
836     .clampScalar(0, S32, S32)
837     .clampScalar(1, S32, S64)
838     .scalarize(0)
839     .widenScalarToNextPow2(0, 32)
840     .widenScalarToNextPow2(1, 32);
841 
842   // The hardware instructions return a different result on 0 than the generic
843   // instructions expect. The hardware produces -1, but these produce the
844   // bitwidth.
845   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
846     .scalarize(0)
847     .clampScalar(0, S32, S32)
848     .clampScalar(1, S32, S64)
849     .widenScalarToNextPow2(0, 32)
850     .widenScalarToNextPow2(1, 32)
851     .lower();
852 
853   // The 64-bit versions produce 32-bit results, but only on the SALU.
854   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
855     .legalFor({{S32, S32}, {S32, S64}})
856     .clampScalar(0, S32, S32)
857     .clampScalar(1, S32, S64)
858     .scalarize(0)
859     .widenScalarToNextPow2(0, 32)
860     .widenScalarToNextPow2(1, 32);
861 
862   getActionDefinitionsBuilder(G_BITREVERSE)
863     .legalFor({S32})
864     .clampScalar(0, S32, S32)
865     .scalarize(0);
866 
867   if (ST.has16BitInsts()) {
868     getActionDefinitionsBuilder(G_BSWAP)
869       .legalFor({S16, S32, V2S16})
870       .clampMaxNumElements(0, S16, 2)
871       // FIXME: Fixing non-power-of-2 before clamp is workaround for
872       // narrowScalar limitation.
873       .widenScalarToNextPow2(0)
874       .clampScalar(0, S16, S32)
875       .scalarize(0);
876 
877     if (ST.hasVOP3PInsts()) {
878       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
879         .legalFor({S32, S16, V2S16})
880         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
881         .clampMaxNumElements(0, S16, 2)
882         .minScalar(0, S16)
883         .widenScalarToNextPow2(0)
884         .scalarize(0)
885         .lower();
886     } else {
887       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
888         .legalFor({S32, S16})
889         .widenScalarToNextPow2(0)
890         .minScalar(0, S16)
891         .scalarize(0)
892         .lower();
893     }
894   } else {
895     // TODO: Should have same legality without v_perm_b32
896     getActionDefinitionsBuilder(G_BSWAP)
897       .legalFor({S32})
898       .lowerIf(scalarNarrowerThan(0, 32))
899       // FIXME: Fixing non-power-of-2 before clamp is workaround for
900       // narrowScalar limitation.
901       .widenScalarToNextPow2(0)
902       .maxScalar(0, S32)
903       .scalarize(0)
904       .lower();
905 
906     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
907       .legalFor({S32})
908       .minScalar(0, S32)
909       .widenScalarToNextPow2(0)
910       .scalarize(0)
911       .lower();
912   }
913 
914   getActionDefinitionsBuilder(G_INTTOPTR)
915     // List the common cases
916     .legalForCartesianProduct(AddrSpaces64, {S64})
917     .legalForCartesianProduct(AddrSpaces32, {S32})
918     .scalarize(0)
919     // Accept any address space as long as the size matches
920     .legalIf(sameSize(0, 1))
921     .widenScalarIf(smallerThan(1, 0),
922       [](const LegalityQuery &Query) {
923         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
924       })
925     .narrowScalarIf(largerThan(1, 0),
926       [](const LegalityQuery &Query) {
927         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
928       });
929 
930   getActionDefinitionsBuilder(G_PTRTOINT)
931     // List the common cases
932     .legalForCartesianProduct(AddrSpaces64, {S64})
933     .legalForCartesianProduct(AddrSpaces32, {S32})
934     .scalarize(0)
935     // Accept any address space as long as the size matches
936     .legalIf(sameSize(0, 1))
937     .widenScalarIf(smallerThan(0, 1),
938       [](const LegalityQuery &Query) {
939         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
940       })
941     .narrowScalarIf(
942       largerThan(0, 1),
943       [](const LegalityQuery &Query) {
944         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
945       });
946 
947   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
948     .scalarize(0)
949     .custom();
950 
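  // Decide whether a memory access has to be split: vector extloads, accesses
  // wider than the address space allows, unsupported three-dword or other
  // non-power-of-2 dword counts, and insufficiently aligned accesses.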
951   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
952                                     bool IsLoad) -> bool {
953     const LLT DstTy = Query.Types[0];
954 
955     // Split vector extloads.
956     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
957     unsigned Align = Query.MMODescrs[0].AlignInBits;
958 
959     if (MemSize < DstTy.getSizeInBits())
960       MemSize = std::max(MemSize, Align);
961 
962     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
963       return true;
964 
965     const LLT PtrTy = Query.Types[1];
966     unsigned AS = PtrTy.getAddressSpace();
967     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
968       return true;
969 
970     // Catch weird sized loads that don't evenly divide into the access sizes
971     // TODO: May be able to widen depending on alignment etc.
972     unsigned NumRegs = (MemSize + 31) / 32;
973     if (NumRegs == 3) {
974       if (!ST.hasDwordx3LoadStores())
975         return true;
976     } else {
977       // If the alignment allows, these should have been widened.
978       if (!isPowerOf2_32(NumRegs))
979         return true;
980     }
981 
982     if (Align < MemSize) {
983       const SITargetLowering *TLI = ST.getTargetLowering();
984       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
985     }
986 
987     return false;
988   };
989 
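  // Whether a non-power-of-2 load result should be widened: only when the
  // alignment already covers the rounded-up size and the original size is
  // below the maximum access size for the address space.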
990   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
991                                          unsigned Opc) -> bool {
992     unsigned Size = Query.Types[0].getSizeInBits();
993     if (isPowerOf2_32(Size))
994       return false;
995 
996     if (Size == 96 && ST.hasDwordx3LoadStores())
997       return false;
998 
999     unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc == G_LOAD))
1001       return false;
1002 
1003     unsigned Align = Query.MMODescrs[0].AlignInBits;
1004     unsigned RoundedSize = NextPowerOf2(Size);
1005     return (Align >= RoundedSize);
1006   };
1007 
1008   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
1009   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
1010   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
1011 
1012   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1013   // LDS
1014   // TODO: Unsupported flat for SI.
1015 
1016   for (unsigned Op : {G_LOAD, G_STORE}) {
1017     const bool IsStore = Op == G_STORE;
1018 
1019     auto &Actions = getActionDefinitionsBuilder(Op);
1020     // Explicitly list some common cases.
1021     // TODO: Does this help compile time at all?
1022     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1023                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1024                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1025                                       {S64, GlobalPtr, 64, GlobalAlign32},
1026                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1027                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1028                                       {S32, GlobalPtr, 8, GlobalAlign8},
1029                                       {S32, GlobalPtr, 16, GlobalAlign16},
1030 
1031                                       {S32, LocalPtr, 32, 32},
1032                                       {S64, LocalPtr, 64, 32},
1033                                       {V2S32, LocalPtr, 64, 32},
1034                                       {S32, LocalPtr, 8, 8},
1035                                       {S32, LocalPtr, 16, 16},
1036                                       {V2S16, LocalPtr, 32, 32},
1037 
1038                                       {S32, PrivatePtr, 32, 32},
1039                                       {S32, PrivatePtr, 8, 8},
1040                                       {S32, PrivatePtr, 16, 16},
1041                                       {V2S16, PrivatePtr, 32, 32},
1042 
1043                                       {S32, ConstantPtr, 32, GlobalAlign32},
1044                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1045                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1046                                       {S64, ConstantPtr, 64, GlobalAlign32},
1047                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1048     Actions.legalIf(
1049       [=](const LegalityQuery &Query) -> bool {
1050         return isLoadStoreLegal(ST, Query, Op);
1051       });
1052 
1053     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1054     // 64-bits.
1055     //
1056     // TODO: Should generalize bitcast action into coerce, which will also cover
1057     // inserting addrspacecasts.
1058     Actions.customIf(typeIs(1, Constant32Ptr));
1059 
1060     // Turn any illegal element vectors into something easier to deal
1061     // with. These will ultimately produce 32-bit scalar shifts to extract the
1062     // parts anyway.
1063     //
1064     // For odd 16-bit element vectors, prefer to split those into pieces with
1065     // 16-bit vector parts.
1066     Actions.bitcastIf(
1067       [=](const LegalityQuery &Query) -> bool {
1068         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1069                                           Query.MMODescrs[0].SizeInBits);
1070       }, bitcastToRegisterType(0));
1071 
1072     Actions
1073         .customIf(typeIs(1, Constant32Ptr))
1074         // Widen suitably aligned loads by loading extra elements.
1075         .moreElementsIf([=](const LegalityQuery &Query) {
1076             const LLT Ty = Query.Types[0];
1077             return Op == G_LOAD && Ty.isVector() &&
1078                    shouldWidenLoadResult(Query, Op);
1079           }, moreElementsToNextPow2(0))
1080         .widenScalarIf([=](const LegalityQuery &Query) {
1081             const LLT Ty = Query.Types[0];
1082             return Op == G_LOAD && !Ty.isVector() &&
1083                    shouldWidenLoadResult(Query, Op);
1084           }, widenScalarOrEltToNextPow2(0))
1085         .narrowScalarIf(
1086             [=](const LegalityQuery &Query) -> bool {
1087               return !Query.Types[0].isVector() &&
1088                      needToSplitMemOp(Query, Op == G_LOAD);
1089             },
1090             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1091               const LLT DstTy = Query.Types[0];
1092               const LLT PtrTy = Query.Types[1];
1093 
1094               const unsigned DstSize = DstTy.getSizeInBits();
1095               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1096 
1097               // Split extloads.
1098               if (DstSize > MemSize)
1099                 return std::make_pair(0, LLT::scalar(MemSize));
1100 
1101               if (!isPowerOf2_32(DstSize)) {
1102                 // We're probably decomposing an odd sized store. Try to split
1103                 // to the widest type. TODO: Account for alignment. As-is it
1104                 // should be OK, since the new parts will be further legalized.
1105                 unsigned FloorSize = PowerOf2Floor(DstSize);
1106                 return std::make_pair(0, LLT::scalar(FloorSize));
1107               }
1108 
1109               if (DstSize > 32 && (DstSize % 32 != 0)) {
1110                 // FIXME: Need a way to specify non-extload of larger size if
1111                 // suitably aligned.
1112                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1113               }
1114 
1115               unsigned MaxSize = maxSizeForAddrSpace(ST,
1116                                                      PtrTy.getAddressSpace(),
1117                                                      Op == G_LOAD);
1118               if (MemSize > MaxSize)
1119                 return std::make_pair(0, LLT::scalar(MaxSize));
1120 
1121               unsigned Align = Query.MMODescrs[0].AlignInBits;
1122               return std::make_pair(0, LLT::scalar(Align));
1123             })
1124         .fewerElementsIf(
1125             [=](const LegalityQuery &Query) -> bool {
1126               return Query.Types[0].isVector() &&
1127                      needToSplitMemOp(Query, Op == G_LOAD);
1128             },
1129             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1130               const LLT DstTy = Query.Types[0];
1131               const LLT PtrTy = Query.Types[1];
1132 
1133               LLT EltTy = DstTy.getElementType();
1134               unsigned MaxSize = maxSizeForAddrSpace(ST,
1135                                                      PtrTy.getAddressSpace(),
1136                                                      Op == G_LOAD);
1137 
1138               // FIXME: Handle widened to power of 2 results better. This ends
1139               // up scalarizing.
1140               // FIXME: 3 element stores scalarized on SI
1141 
1142               // Split if it's too large for the address space.
1143               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1144                 unsigned NumElts = DstTy.getNumElements();
1145                 unsigned EltSize = EltTy.getSizeInBits();
1146 
1147                 if (MaxSize % EltSize == 0) {
1148                   return std::make_pair(
1149                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1150                 }
1151 
1152                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1153 
1154                 // FIXME: Refine when odd breakdowns handled
1155                 // The scalars will need to be re-legalized.
1156                 if (NumPieces == 1 || NumPieces >= NumElts ||
1157                     NumElts % NumPieces != 0)
1158                   return std::make_pair(0, EltTy);
1159 
1160                 return std::make_pair(0,
1161                                       LLT::vector(NumElts / NumPieces, EltTy));
1162               }
1163 
1164               // FIXME: We could probably handle weird extending loads better.
1165               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1166               if (DstTy.getSizeInBits() > MemSize)
1167                 return std::make_pair(0, EltTy);
1168 
1169               unsigned EltSize = EltTy.getSizeInBits();
1170               unsigned DstSize = DstTy.getSizeInBits();
1171               if (!isPowerOf2_32(DstSize)) {
1172                 // We're probably decomposing an odd sized store. Try to split
1173                 // to the widest type. TODO: Account for alignment. As-is it
1174                 // should be OK, since the new parts will be further legalized.
1175                 unsigned FloorSize = PowerOf2Floor(DstSize);
1176                 return std::make_pair(
1177                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1178               }
1179 
1180               // Need to split because of alignment.
1181               unsigned Align = Query.MMODescrs[0].AlignInBits;
1182               if (EltSize > Align &&
1183                   (EltSize / Align < DstTy.getNumElements())) {
1184                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1185               }
1186 
1187               // May need relegalization for the scalars.
1188               return std::make_pair(0, EltTy);
1189             })
1190         .minScalar(0, S32);
1191 
1192     if (IsStore)
1193       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1194 
1195     // TODO: Need a bitcast lower option?
1196     Actions
1197         .widenScalarToNextPow2(0)
1198         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1199   }
1200 
1201   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1202                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1203                                                   {S32, GlobalPtr, 16, 2 * 8},
1204                                                   {S32, LocalPtr, 8, 8},
1205                                                   {S32, LocalPtr, 16, 16},
1206                                                   {S32, PrivatePtr, 8, 8},
1207                                                   {S32, PrivatePtr, 16, 16},
1208                                                   {S32, ConstantPtr, 8, 8},
1209                                                   {S32, ConstantPtr, 16, 2 * 8}});
1210   if (ST.hasFlatAddressSpace()) {
1211     ExtLoads.legalForTypesWithMemDesc(
1212         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1213   }
1214 
1215   ExtLoads.clampScalar(0, S32, S32)
1216           .widenScalarToNextPow2(0)
1217           .unsupportedIfMemSizeNotPow2()
1218           .lower();
1219 
1220   auto &Atomics = getActionDefinitionsBuilder(
1221     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1222      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1223      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1224      G_ATOMICRMW_UMIN})
1225     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1226                {S64, GlobalPtr}, {S64, LocalPtr},
1227                {S32, RegionPtr}, {S64, RegionPtr}});
1228   if (ST.hasFlatAddressSpace()) {
1229     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1230   }
1231 
1232   if (ST.hasLDSFPAtomics()) {
1233     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1234       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1235   }
1236 
1237   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1238   // demarshalling
1239   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1240     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1241                 {S32, FlatPtr}, {S64, FlatPtr}})
1242     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1243                {S32, RegionPtr}, {S64, RegionPtr}});
1244   // TODO: Pointer types, any 32-bit or 64-bit vector
1245 
1246   // Condition should be s32 for scalar, s1 for vector.
1247   getActionDefinitionsBuilder(G_SELECT)
1248     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1249           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1250           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1251     .clampScalar(0, S16, S64)
1252     .scalarize(1)
1253     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1254     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1255     .clampMaxNumElements(0, S32, 2)
1256     .clampMaxNumElements(0, LocalPtr, 2)
1257     .clampMaxNumElements(0, PrivatePtr, 2)
1258     .scalarize(0)
1259     .widenScalarToNextPow2(0)
1260     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1261 
1262   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1263   // be more flexible with the shift amount type.
1264   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1265     .legalFor({{S32, S32}, {S64, S32}});
1266   if (ST.has16BitInsts()) {
1267     if (ST.hasVOP3PInsts()) {
1268       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1269             .clampMaxNumElements(0, S16, 2);
1270     } else
1271       Shifts.legalFor({{S16, S16}});
1272 
1273     // TODO: Support 16-bit shift amounts for all types
1274     Shifts.widenScalarIf(
1275       [=](const LegalityQuery &Query) {
1276         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1277         // 32-bit amount.
1278         const LLT ValTy = Query.Types[0];
1279         const LLT AmountTy = Query.Types[1];
1280         return ValTy.getSizeInBits() <= 16 &&
1281                AmountTy.getSizeInBits() < 16;
1282       }, changeTo(1, S16));
1283     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1284     Shifts.clampScalar(1, S32, S32);
1285     Shifts.clampScalar(0, S16, S64);
1286     Shifts.widenScalarToNextPow2(0, 16);
1287   } else {
1288     // Make sure we legalize the shift amount type first, as the general
1289     // expansion for the shifted type will produce much worse code if it hasn't
1290     // been truncated already.
1291     Shifts.clampScalar(1, S32, S32);
1292     Shifts.clampScalar(0, S32, S64);
1293     Shifts.widenScalarToNextPow2(0, 32);
1294   }
1295   Shifts.scalarize(0);
1296 
1297   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1298     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1299     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1300     unsigned IdxTypeIdx = 2;
1301 
1302     getActionDefinitionsBuilder(Op)
1303       .customIf([=](const LegalityQuery &Query) {
1304           const LLT EltTy = Query.Types[EltTypeIdx];
1305           const LLT VecTy = Query.Types[VecTypeIdx];
1306           const LLT IdxTy = Query.Types[IdxTypeIdx];
1307           const unsigned EltSize = EltTy.getSizeInBits();
1308           return (EltSize == 32 || EltSize == 64) &&
1309                   VecTy.getSizeInBits() % 32 == 0 &&
1310                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1311                   IdxTy.getSizeInBits() == 32;
1312         })
1313       .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
1314                  bitcastToVectorElement32(1))
1315       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1316       .bitcastIf(
1317         all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
1318         [=](const LegalityQuery &Query) {
1319           // For > 64-bit element types, try to turn this into a 64-bit
1320           // element vector since we may be able to do better indexing
1321           // if this is scalar. If not, fall back to 32.
1322           const LLT EltTy = Query.Types[EltTypeIdx];
1323           const LLT VecTy = Query.Types[VecTypeIdx];
1324           const unsigned DstEltSize = EltTy.getSizeInBits();
1325           const unsigned VecSize = VecTy.getSizeInBits();
1326 
1327           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1328           return std::make_pair(
1329             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1330         })
1331       .clampScalar(EltTypeIdx, S32, S64)
1332       .clampScalar(VecTypeIdx, S32, S64)
1333       .clampScalar(IdxTypeIdx, S32, S32)
1334       // TODO: Clamp the number of elements before resorting to stack lowering.
1335       // It should only be necessary with variable indexes.
1336       // As a last resort, lower to the stack
1337       .lower();
1338   }
1339 
1340   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1341     .unsupportedIf([=](const LegalityQuery &Query) {
1342         const LLT &EltTy = Query.Types[1].getElementType();
1343         return Query.Types[0] != EltTy;
1344       });
1345 
1346   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1347     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1348     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1349 
1350     // FIXME: Doesn't handle extract of illegal sizes.
1351     getActionDefinitionsBuilder(Op)
1352       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1353       // FIXME: Multiples of 16 should not be legal.
1354       .legalIf([=](const LegalityQuery &Query) {
1355           const LLT BigTy = Query.Types[BigTyIdx];
1356           const LLT LitTy = Query.Types[LitTyIdx];
1357           return (BigTy.getSizeInBits() % 32 == 0) &&
1358                  (LitTy.getSizeInBits() % 16 == 0);
1359         })
1360       .widenScalarIf(
1361         [=](const LegalityQuery &Query) {
1362           const LLT BigTy = Query.Types[BigTyIdx];
1363           return (BigTy.getScalarSizeInBits() < 16);
1364         },
1365         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1366       .widenScalarIf(
1367         [=](const LegalityQuery &Query) {
1368           const LLT LitTy = Query.Types[LitTyIdx];
1369           return (LitTy.getScalarSizeInBits() < 16);
1370         },
1371         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1372       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1373       .widenScalarToNextPow2(BigTyIdx, 32);
1374 
1375   }
1376 
1377   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1378     .legalForCartesianProduct(AllS32Vectors, {S32})
1379     .legalForCartesianProduct(AllS64Vectors, {S64})
1380     .clampNumElements(0, V16S32, V32S32)
1381     .clampNumElements(0, V2S64, V16S64)
1382     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1383 
1384   if (ST.hasScalarPackInsts()) {
1385     BuildVector
1386       // FIXME: Should probably widen s1 vectors straight to s32
1387       .minScalarOrElt(0, S16)
1388       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1389       .minScalar(1, S32);
1390 
1391     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1392       .legalFor({V2S16, S32})
1393       .lower();
1394     BuildVector.minScalarOrElt(0, S32);
1395   } else {
1396     BuildVector.customFor({V2S16, S16});
1397     BuildVector.minScalarOrElt(0, S32);
1398 
1399     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1400       .customFor({V2S16, S32})
1401       .lower();
1402   }
1403 
1404   BuildVector.legalIf(isRegisterType(0));
1405 
1406   // FIXME: Clamp maximum size
1407   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1408     .legalIf(isRegisterType(0));
1409 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1412   if (ST.hasVOP3PInsts()) {
1413     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1414       .customFor({V2S16, V2S16})
1415       .lower();
1416   } else
1417     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1418 
1419   // Merge/Unmerge
1420   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1421     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1422     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1423 
1424     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1425       const LLT Ty = Query.Types[TypeIdx];
1426       if (Ty.isVector()) {
1427         const LLT &EltTy = Ty.getElementType();
1428         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1429           return true;
1430         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1431           return true;
1432       }
1433       return false;
1434     };
1435 
1436     auto &Builder = getActionDefinitionsBuilder(Op)
1437       .lowerFor({{S16, V2S16}})
1438       .lowerIf([=](const LegalityQuery &Query) {
1439           const LLT BigTy = Query.Types[BigTyIdx];
1440           return BigTy.getSizeInBits() == 32;
1441         })
1442       // Try to widen to s16 first for small types.
1443       // TODO: Only do this on targets with legal s16 shifts
1444       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1445       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1446       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1447       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1448                            elementTypeIs(1, S16)),
1449                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1453       .clampScalar(LitTyIdx, S32, S512)
1454       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1455       // Break up vectors with weird elements into scalars
1456       .fewerElementsIf(
1457         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1458         scalarize(0))
1459       .fewerElementsIf(
1460         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1461         scalarize(1))
1462       .clampScalar(BigTyIdx, S32, MaxScalar);
1463 
1464     if (Op == G_MERGE_VALUES) {
1465       Builder.widenScalarIf(
1466         // TODO: Use 16-bit shifts if legal for 8-bit values?
1467         [=](const LegalityQuery &Query) {
1468           const LLT Ty = Query.Types[LitTyIdx];
1469           return Ty.getSizeInBits() < 32;
1470         },
1471         changeTo(LitTyIdx, S32));
1472     }
1473 
1474     Builder.widenScalarIf(
1475       [=](const LegalityQuery &Query) {
1476         const LLT Ty = Query.Types[BigTyIdx];
1477         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1478           Ty.getSizeInBits() % 16 != 0;
1479       },
1480       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1483         const LLT &Ty = Query.Types[BigTyIdx];
1484         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1485         if (NewSizeInBits >= 256) {
1486           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1487           if (RoundedTo < NewSizeInBits)
1488             NewSizeInBits = RoundedTo;
1489         }
1490         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1491       })
1492       .legalIf([=](const LegalityQuery &Query) {
1493           const LLT &BigTy = Query.Types[BigTyIdx];
1494           const LLT &LitTy = Query.Types[LitTyIdx];
1495 
1496           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1497             return false;
1498           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1499             return false;
1500 
1501           return BigTy.getSizeInBits() % 16 == 0 &&
1502                  LitTy.getSizeInBits() % 16 == 0 &&
1503                  BigTy.getSizeInBits() <= MaxRegisterSize;
1504         })
1505       // Any vectors left are the wrong size. Scalarize them.
1506       .scalarize(0)
1507       .scalarize(1);
1508   }
1509 
1510   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1511   // RegBankSelect.
1512   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1513     .legalFor({{S32}, {S64}});
1514 
1515   if (ST.hasVOP3PInsts()) {
1516     SextInReg.lowerFor({{V2S16}})
1517       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1518       // get more vector shift opportunities, since we'll get those when
1519       // expanded.
1520       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1521   } else if (ST.has16BitInsts()) {
1522     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1523   } else {
1524     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1526     SextInReg.lowerFor({{S32}, {S64}});
1527   }
1528 
1529   SextInReg
1530     .scalarize(0)
1531     .clampScalar(0, S32, S64)
1532     .lower();
1533 
1534   getActionDefinitionsBuilder(G_FSHR)
1535     .legalFor({{S32, S32}})
1536     .scalarize(0)
1537     .lower();
1538 
1539   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1540     .legalFor({S64});
1541 
1542   getActionDefinitionsBuilder(G_FENCE)
1543     .alwaysLegal();
1544 
1545   getActionDefinitionsBuilder({
1546       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1547       G_FCOPYSIGN,
1548 
1549       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1550       G_ATOMICRMW_NAND,
1551       G_ATOMICRMW_FSUB,
1552       G_READ_REGISTER,
1553       G_WRITE_REGISTER,
1554 
1555       G_SADDO, G_SSUBO,
1556 
      // TODO: Implement
1558       G_FMINIMUM, G_FMAXIMUM,
1559       G_FSHL
1560     }).lower();
1561 
1562   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1563         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1564         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1565     .unsupported();
1566 
1567   computeTables();
1568   verify(*ST.getInstrInfo());
1569 }
1570 
1571 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1572                                          MachineInstr &MI) const {
1573   MachineIRBuilder &B = Helper.MIRBuilder;
1574   MachineRegisterInfo &MRI = *B.getMRI();
1575   GISelChangeObserver &Observer = Helper.Observer;
1576 
1577   switch (MI.getOpcode()) {
1578   case TargetOpcode::G_ADDRSPACE_CAST:
1579     return legalizeAddrSpaceCast(MI, MRI, B);
1580   case TargetOpcode::G_FRINT:
1581     return legalizeFrint(MI, MRI, B);
1582   case TargetOpcode::G_FCEIL:
1583     return legalizeFceil(MI, MRI, B);
1584   case TargetOpcode::G_INTRINSIC_TRUNC:
1585     return legalizeIntrinsicTrunc(MI, MRI, B);
1586   case TargetOpcode::G_SITOFP:
1587     return legalizeITOFP(MI, MRI, B, true);
1588   case TargetOpcode::G_UITOFP:
1589     return legalizeITOFP(MI, MRI, B, false);
1590   case TargetOpcode::G_FPTOSI:
1591     return legalizeFPTOI(MI, MRI, B, true);
1592   case TargetOpcode::G_FPTOUI:
1593     return legalizeFPTOI(MI, MRI, B, false);
1594   case TargetOpcode::G_FMINNUM:
1595   case TargetOpcode::G_FMAXNUM:
1596   case TargetOpcode::G_FMINNUM_IEEE:
1597   case TargetOpcode::G_FMAXNUM_IEEE:
1598     return legalizeMinNumMaxNum(Helper, MI);
1599   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1600     return legalizeExtractVectorElt(MI, MRI, B);
1601   case TargetOpcode::G_INSERT_VECTOR_ELT:
1602     return legalizeInsertVectorElt(MI, MRI, B);
1603   case TargetOpcode::G_SHUFFLE_VECTOR:
1604     return legalizeShuffleVector(MI, MRI, B);
1605   case TargetOpcode::G_FSIN:
1606   case TargetOpcode::G_FCOS:
1607     return legalizeSinCos(MI, MRI, B);
1608   case TargetOpcode::G_GLOBAL_VALUE:
1609     return legalizeGlobalValue(MI, MRI, B);
1610   case TargetOpcode::G_LOAD:
1611     return legalizeLoad(MI, MRI, B, Observer);
1612   case TargetOpcode::G_FMAD:
1613     return legalizeFMad(MI, MRI, B);
1614   case TargetOpcode::G_FDIV:
1615     return legalizeFDIV(MI, MRI, B);
1616   case TargetOpcode::G_UDIV:
1617   case TargetOpcode::G_UREM:
1618     return legalizeUDIV_UREM(MI, MRI, B);
1619   case TargetOpcode::G_SDIV:
1620   case TargetOpcode::G_SREM:
1621     return legalizeSDIV_SREM(MI, MRI, B);
1622   case TargetOpcode::G_ATOMIC_CMPXCHG:
1623     return legalizeAtomicCmpXChg(MI, MRI, B);
1624   case TargetOpcode::G_FLOG:
1625     return legalizeFlog(MI, B, numbers::ln2f);
1626   case TargetOpcode::G_FLOG10:
1627     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1628   case TargetOpcode::G_FEXP:
1629     return legalizeFExp(MI, B);
1630   case TargetOpcode::G_FPOW:
1631     return legalizeFPow(MI, B);
1632   case TargetOpcode::G_FFLOOR:
1633     return legalizeFFloor(MI, MRI, B);
1634   case TargetOpcode::G_BUILD_VECTOR:
1635     return legalizeBuildVector(MI, MRI, B);
1636   default:
1637     return false;
1638   }
1639 
1640   llvm_unreachable("expected switch to return");
1641 }
1642 
1643 Register AMDGPULegalizerInfo::getSegmentAperture(
1644   unsigned AS,
1645   MachineRegisterInfo &MRI,
1646   MachineIRBuilder &B) const {
1647   MachineFunction &MF = B.getMF();
1648   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1649   const LLT S32 = LLT::scalar(32);
1650 
1651   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1652 
1653   if (ST.hasApertureRegs()) {
1654     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1655     // getreg.
1656     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1657         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1658         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1659     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1660         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1661         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1662     unsigned Encoding =
1663         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1664         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1665         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1666 
1667     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1668 
1669     B.buildInstr(AMDGPU::S_GETREG_B32)
1670       .addDef(GetReg)
1671       .addImm(Encoding);
1672     MRI.setType(GetReg, S32);
1673 
1674     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1675     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1676   }
1677 
1678   Register QueuePtr = MRI.createGenericVirtualRegister(
1679     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1680 
1681   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1682     return Register();
1683 
1684   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1685   // private_segment_aperture_base_hi.
1686   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1687 
1688   // TODO: can we be smarter about machine pointer info?
1689   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1690   MachineMemOperand *MMO = MF.getMachineMemOperand(
1691       PtrInfo,
1692       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1693           MachineMemOperand::MOInvariant,
1694       4, commonAlignment(Align(64), StructOffset));
1695 
1696   Register LoadAddr;
1697 
1698   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1699   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1700 }
1701 
1702 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1703   MachineInstr &MI, MachineRegisterInfo &MRI,
1704   MachineIRBuilder &B) const {
1705   MachineFunction &MF = B.getMF();
1706 
1707   const LLT S32 = LLT::scalar(32);
1708   Register Dst = MI.getOperand(0).getReg();
1709   Register Src = MI.getOperand(1).getReg();
1710 
1711   LLT DstTy = MRI.getType(Dst);
1712   LLT SrcTy = MRI.getType(Src);
1713   unsigned DestAS = DstTy.getAddressSpace();
1714   unsigned SrcAS = SrcTy.getAddressSpace();
1715 
1716   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1717   // vector element.
1718   assert(!DstTy.isVector());
1719 
1720   const AMDGPUTargetMachine &TM
1721     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1722 
1723   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1724     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1725     return true;
1726   }
1727 
1728   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1729     // Truncate.
1730     B.buildExtract(Dst, Src, 0);
1731     MI.eraseFromParent();
1732     return true;
1733   }
1734 
1735   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1736     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1737     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1738 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1742     auto HighAddr = B.buildConstant(
1743       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1744     B.buildMerge(Dst, {Src, HighAddr});
1745     MI.eraseFromParent();
1746     return true;
1747   }
1748 
1749   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1750     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1751            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1752     unsigned NullVal = TM.getNullPointerValue(DestAS);
1753 
1754     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1755     auto FlatNull = B.buildConstant(SrcTy, 0);
1756 
1757     // Extract low 32-bits of the pointer.
1758     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1759 
1760     auto CmpRes =
1761         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1762     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1763 
1764     MI.eraseFromParent();
1765     return true;
1766   }
1767 
1768   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1769     return false;
1770 
1771   if (!ST.hasFlatAddressSpace())
1772     return false;
1773 
1774   auto SegmentNull =
1775       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1776   auto FlatNull =
1777       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1778 
1779   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1780   if (!ApertureReg.isValid())
1781     return false;
1782 
1783   auto CmpRes =
1784       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1785 
1786   // Coerce the type of the low half of the result so we can use merge_values.
1787   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1788 
1789   // TODO: Should we allow mismatched types but matching sizes in merges to
1790   // avoid the ptrtoint?
1791   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1792   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1793 
1794   MI.eraseFromParent();
1795   return true;
1796 }
1797 
1798 bool AMDGPULegalizerInfo::legalizeFrint(
1799   MachineInstr &MI, MachineRegisterInfo &MRI,
1800   MachineIRBuilder &B) const {
1801   Register Src = MI.getOperand(1).getReg();
1802   LLT Ty = MRI.getType(Src);
1803   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1804 
1805   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1806   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
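  // Rounding is done by adding and then subtracting 2^52 (C1, copysigned to
  // match the input): for |x| < 2^52 the addition forces rounding to an
  // integer in the current rounding mode. Inputs with |x| > C2 (just below
  // 2^52) are already integral and are passed through by the final select.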
1807 
1808   auto C1 = B.buildFConstant(Ty, C1Val);
1809   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1810 
1811   // TODO: Should this propagate fast-math-flags?
1812   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1813   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1814 
1815   auto C2 = B.buildFConstant(Ty, C2Val);
1816   auto Fabs = B.buildFAbs(Ty, Src);
1817 
1818   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1819   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1820   MI.eraseFromParent();
1821   return true;
1822 }
1823 
1824 bool AMDGPULegalizerInfo::legalizeFceil(
1825   MachineInstr &MI, MachineRegisterInfo &MRI,
1826   MachineIRBuilder &B) const {
1827 
1828   const LLT S1 = LLT::scalar(1);
1829   const LLT S64 = LLT::scalar(64);
1830 
1831   Register Src = MI.getOperand(1).getReg();
1832   assert(MRI.getType(Src) == S64);
1833 
1834   // result = trunc(src)
1835   // if (src > 0.0 && src != result)
1836   //   result += 1.0
1837 
1838   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1839 
1840   const auto Zero = B.buildFConstant(S64, 0.0);
1841   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1845   auto Add = B.buildSelect(S64, And, One, Zero);
1846 
1847   // TODO: Should this propagate fast-math-flags?
1848   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1850 }
1851 
1852 static MachineInstrBuilder extractF64Exponent(Register Hi,
1853                                               MachineIRBuilder &B) {
1854   const unsigned FractBits = 52;
1855   const unsigned ExpBits = 11;
1856   LLT S32 = LLT::scalar(32);
1857 
1858   auto Const0 = B.buildConstant(S32, FractBits - 32);
1859   auto Const1 = B.buildConstant(S32, ExpBits);
1860 
1861   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1862     .addUse(Hi)
1863     .addUse(Const0.getReg(0))
1864     .addUse(Const1.getReg(0));
1865 
1866   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1867 }
1868 
1869 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1870   MachineInstr &MI, MachineRegisterInfo &MRI,
1871   MachineIRBuilder &B) const {
1872   const LLT S1 = LLT::scalar(1);
1873   const LLT S32 = LLT::scalar(32);
1874   const LLT S64 = LLT::scalar(64);
1875 
1876   Register Src = MI.getOperand(1).getReg();
1877   assert(MRI.getType(Src) == S64);
1878 
1879   // TODO: Should this use extract since the low half is unused?
1880   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1881   Register Hi = Unmerge.getReg(1);
1882 
1883   // Extract the upper half, since this is where we will find the sign and
1884   // exponent.
1885   auto Exp = extractF64Exponent(Hi, B);
1886 
1887   const unsigned FractBits = 52;
1888 
1889   // Extract the sign bit.
1890   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1891   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1892 
1893   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1894 
1895   const auto Zero32 = B.buildConstant(S32, 0);
1896 
1897   // Extend back to 64-bits.
1898   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1899 
1900   auto Shr = B.buildAShr(S64, FractMask, Exp);
1901   auto Not = B.buildNot(S64, Shr);
1902   auto Tmp0 = B.buildAnd(S64, Src, Not);
1903   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
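  // The unbiased exponent selects the behavior: Exp < 0 means |src| < 1, so
  // the truncated value is just a signed zero (SignBit64). Exp > 51 means all
  // fraction bits carry integer significance, so src is already integral.
  // Otherwise, clear the low (52 - Exp) fraction bits, which hold the
  // fractional part.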
1904 
1905   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1906   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1907 
1908   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1909   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1910   MI.eraseFromParent();
1911   return true;
1912 }
1913 
1914 bool AMDGPULegalizerInfo::legalizeITOFP(
1915   MachineInstr &MI, MachineRegisterInfo &MRI,
1916   MachineIRBuilder &B, bool Signed) const {
1917 
1918   Register Dst = MI.getOperand(0).getReg();
1919   Register Src = MI.getOperand(1).getReg();
1920 
1921   const LLT S64 = LLT::scalar(64);
1922   const LLT S32 = LLT::scalar(32);
1923 
1924   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1925 
1926   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1927 
1928   auto CvtHi = Signed ?
1929     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1930     B.buildUITOFP(S64, Unmerge.getReg(1));
1931 
1932   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
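  // Convert each 32-bit half separately and recombine as
  // (fp)hi * 2^32 + (fp)lo; the ldexp below performs the exact 2^32 scaling.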
1933 
1934   auto ThirtyTwo = B.buildConstant(S32, 32);
1935   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1936     .addUse(CvtHi.getReg(0))
1937     .addUse(ThirtyTwo.getReg(0));
1938 
1939   // TODO: Should this propagate fast-math-flags?
1940   B.buildFAdd(Dst, LdExp, CvtLo);
1941   MI.eraseFromParent();
1942   return true;
1943 }
1944 
1945 // TODO: Copied from DAG implementation. Verify logic and document how this
1946 // actually works.
1947 bool AMDGPULegalizerInfo::legalizeFPTOI(
1948   MachineInstr &MI, MachineRegisterInfo &MRI,
1949   MachineIRBuilder &B, bool Signed) const {
1950 
1951   Register Dst = MI.getOperand(0).getReg();
1952   Register Src = MI.getOperand(1).getReg();
1953 
1954   const LLT S64 = LLT::scalar(64);
1955   const LLT S32 = LLT::scalar(32);
1956 
1957   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1958 
1959   unsigned Flags = MI.getFlags();
1960 
1961   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1962   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1963   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
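  // K0 is 2^-32 and K1 is -2^32. FloorMul = floor(trunc(src) * 2^-32) is the
  // high 32 bits of the result, and fma(FloorMul, -2^32, trunc(src)) leaves
  // the low 32 bits, which are converted separately and merged below.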
1964 
1965   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1966   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1967   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1968 
1969   auto Hi = Signed ?
1970     B.buildFPTOSI(S32, FloorMul) :
1971     B.buildFPTOUI(S32, FloorMul);
1972   auto Lo = B.buildFPTOUI(S32, Fma);
1973 
1974   B.buildMerge(Dst, { Lo, Hi });
1975   MI.eraseFromParent();
1976 
1977   return true;
1978 }
1979 
1980 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1981                                                MachineInstr &MI) const {
1982   MachineFunction &MF = Helper.MIRBuilder.getMF();
1983   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1984 
1985   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1986                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1987 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1990   if (!MFI->getMode().IEEE)
1991     return !IsIEEEOp;
1992 
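  // With ieee_mode enabled, the IEEE variants already map to the instruction
  // behavior; the non-IEEE variants go through the generic lowering, which
  // inserts canonicalizes to quiet possible signaling NaNs.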
1993   if (IsIEEEOp)
1994     return true;
1995 
1996   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1997 }
1998 
1999 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2000   MachineInstr &MI, MachineRegisterInfo &MRI,
2001   MachineIRBuilder &B) const {
2002   // TODO: Should move some of this into LegalizerHelper.
2003 
2004   // TODO: Promote dynamic indexing of s16 to s32
2005 
2006   // FIXME: Artifact combiner probably should have replaced the truncated
2007   // constant before this, so we shouldn't need
2008   // getConstantVRegValWithLookThrough.
2009   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2010     MI.getOperand(2).getReg(), MRI);
2011   if (!IdxVal) // Dynamic case will be selected to register indexing.
2012     return true;
2013 
2014   Register Dst = MI.getOperand(0).getReg();
2015   Register Vec = MI.getOperand(1).getReg();
2016 
2017   LLT VecTy = MRI.getType(Vec);
2018   LLT EltTy = VecTy.getElementType();
2019   assert(EltTy == MRI.getType(Dst));
2020 
2021   if (IdxVal->Value < VecTy.getNumElements())
2022     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
2023   else
2024     B.buildUndef(Dst);
2025 
2026   MI.eraseFromParent();
2027   return true;
2028 }
2029 
2030 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2031   MachineInstr &MI, MachineRegisterInfo &MRI,
2032   MachineIRBuilder &B) const {
2033   // TODO: Should move some of this into LegalizerHelper.
2034 
2035   // TODO: Promote dynamic indexing of s16 to s32
2036 
2037   // FIXME: Artifact combiner probably should have replaced the truncated
2038   // constant before this, so we shouldn't need
2039   // getConstantVRegValWithLookThrough.
2040   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
2041     MI.getOperand(3).getReg(), MRI);
2042   if (!IdxVal) // Dynamic case will be selected to register indexing.
2043     return true;
2044 
2045   Register Dst = MI.getOperand(0).getReg();
2046   Register Vec = MI.getOperand(1).getReg();
2047   Register Ins = MI.getOperand(2).getReg();
2048 
2049   LLT VecTy = MRI.getType(Vec);
2050   LLT EltTy = VecTy.getElementType();
2051   assert(EltTy == MRI.getType(Ins));
2052 
2053   if (IdxVal->Value < VecTy.getNumElements())
2054     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2055   else
2056     B.buildUndef(Dst);
2057 
2058   MI.eraseFromParent();
2059   return true;
2060 }
2061 
2062 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2063   MachineInstr &MI, MachineRegisterInfo &MRI,
2064   MachineIRBuilder &B) const {
2065   const LLT V2S16 = LLT::vector(2, 16);
2066 
2067   Register Dst = MI.getOperand(0).getReg();
2068   Register Src0 = MI.getOperand(1).getReg();
2069   LLT DstTy = MRI.getType(Dst);
2070   LLT SrcTy = MRI.getType(Src0);
2071 
2072   if (SrcTy == V2S16 && DstTy == V2S16 &&
2073       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2074     return true;
2075 
2076   MachineIRBuilder HelperBuilder(MI);
2077   GISelObserverWrapper DummyObserver;
2078   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2079   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2080 }
2081 
2082 bool AMDGPULegalizerInfo::legalizeSinCos(
2083   MachineInstr &MI, MachineRegisterInfo &MRI,
2084   MachineIRBuilder &B) const {
2085 
2086   Register DstReg = MI.getOperand(0).getReg();
2087   Register SrcReg = MI.getOperand(1).getReg();
2088   LLT Ty = MRI.getType(DstReg);
2089   unsigned Flags = MI.getFlags();
2090 
2091   Register TrigVal;
2092   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
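  // The hardware sin/cos intrinsics take their input in units of full turns,
  // hence the scale by 1/(2*pi). Subtargets with a reduced valid input range
  // additionally need fract to wrap the scaled operand into [0, 1).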
2093   if (ST.hasTrigReducedRange()) {
2094     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2095     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2096       .addUse(MulVal.getReg(0))
2097       .setMIFlags(Flags).getReg(0);
2098   } else
2099     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2100 
2101   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2102     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2103   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2104     .addUse(TrigVal)
2105     .setMIFlags(Flags);
2106   MI.eraseFromParent();
2107   return true;
2108 }
2109 
2110 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2111                                                   MachineIRBuilder &B,
2112                                                   const GlobalValue *GV,
2113                                                   int64_t Offset,
2114                                                   unsigned GAFlags) const {
2115   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2116   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2117   // to the following code sequence:
2118   //
2119   // For constant address space:
2120   //   s_getpc_b64 s[0:1]
2121   //   s_add_u32 s0, s0, $symbol
2122   //   s_addc_u32 s1, s1, 0
2123   //
2124   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2125   //   a fixup or relocation is emitted to replace $symbol with a literal
2126   //   constant, which is a pc-relative offset from the encoding of the $symbol
2127   //   operand to the global variable.
2128   //
2129   // For global address space:
2130   //   s_getpc_b64 s[0:1]
2131   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2132   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2133   //
2134   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2135   //   fixups or relocations are emitted to replace $symbol@*@lo and
2136   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2137   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2138   //   operand to the global variable.
2139   //
2140   // What we want here is an offset from the value returned by s_getpc
2141   // (which is the address of the s_add_u32 instruction) to the global
2142   // variable, but since the encoding of $symbol starts 4 bytes after the start
2143   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2144   // small. This requires us to add 4 to the global variable offset in order to
2145   // compute the correct address.
2146 
2147   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2148 
2149   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2150     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2151 
2152   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2153     .addDef(PCReg);
2154 
2155   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
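  // The second operand supplies the high half of the offset: a literal 0 when
  // no relocation is needed, otherwise the same symbol with the corresponding
  // *_HI target flag (which immediately follows the *_LO flag, hence + 1).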
2156   if (GAFlags == SIInstrInfo::MO_NONE)
2157     MIB.addImm(0);
2158   else
2159     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2160 
2161   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2162 
2163   if (PtrTy.getSizeInBits() == 32)
2164     B.buildExtract(DstReg, PCReg, 0);
2165   return true;
}
2167 
2168 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2169   MachineInstr &MI, MachineRegisterInfo &MRI,
2170   MachineIRBuilder &B) const {
2171   Register DstReg = MI.getOperand(0).getReg();
2172   LLT Ty = MRI.getType(DstReg);
2173   unsigned AS = Ty.getAddressSpace();
2174 
2175   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2176   MachineFunction &MF = B.getMF();
2177   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2178 
2179   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2180     if (!MFI->isEntryFunction()) {
2181       const Function &Fn = MF.getFunction();
2182       DiagnosticInfoUnsupported BadLDSDecl(
2183         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2184         DS_Warning);
2185       Fn.getContext().diagnose(BadLDSDecl);
2186 
2187       // We currently don't have a way to correctly allocate LDS objects that
2188       // aren't directly associated with a kernel. We do force inlining of
2189       // functions that use local objects. However, if these dead functions are
2190       // not eliminated, we don't want a compile time error. Just emit a warning
2191       // and a trap, since there should be no callable path here.
2192       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2193       B.buildUndef(DstReg);
2194       MI.eraseFromParent();
2195       return true;
2196     }
2197 
2198     // TODO: We could emit code to handle the initialization somewhere.
2199     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2200       const SITargetLowering *TLI = ST.getTargetLowering();
2201       if (!TLI->shouldUseLDSConstAddress(GV)) {
2202         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2204       }
2205 
2206       B.buildConstant(
2207           DstReg,
2208           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2209       MI.eraseFromParent();
2210       return true;
2211     }
2212 
2213     const Function &Fn = MF.getFunction();
2214     DiagnosticInfoUnsupported BadInit(
2215       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2216     Fn.getContext().diagnose(BadInit);
2217     return true;
2218   }
2219 
2220   const SITargetLowering *TLI = ST.getTargetLowering();
2221 
2222   if (TLI->shouldEmitFixup(GV)) {
2223     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2224     MI.eraseFromParent();
2225     return true;
2226   }
2227 
2228   if (TLI->shouldEmitPCReloc(GV)) {
2229     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2230     MI.eraseFromParent();
2231     return true;
2232   }
2233 
2234   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2235   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2236 
2237   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2238       MachinePointerInfo::getGOT(MF),
2239       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2240           MachineMemOperand::MOInvariant,
2241       8 /*Size*/, Align(8));
2242 
2243   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2244 
2245   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2247     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2248     B.buildExtract(DstReg, Load, 0);
2249   } else
2250     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2251 
2252   MI.eraseFromParent();
2253   return true;
2254 }
2255 
2256 bool AMDGPULegalizerInfo::legalizeLoad(
2257   MachineInstr &MI, MachineRegisterInfo &MRI,
2258   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2259   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2260   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2261   Observer.changingInstr(MI);
2262   MI.getOperand(1).setReg(Cast.getReg(0));
2263   Observer.changedInstr(MI);
2264   return true;
2265 }
2266 
2267 bool AMDGPULegalizerInfo::legalizeFMad(
2268   MachineInstr &MI, MachineRegisterInfo &MRI,
2269   MachineIRBuilder &B) const {
2270   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2271   assert(Ty.isScalar());
2272 
2273   MachineFunction &MF = B.getMF();
2274   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2275 
2276   // TODO: Always legal with future ftz flag.
2277   // FIXME: Do we need just output?
2278   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2279     return true;
2280   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2281     return true;
2282 
2283   MachineIRBuilder HelperBuilder(MI);
2284   GISelObserverWrapper DummyObserver;
2285   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2286   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2287 }
2288 
2289 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2290   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2291   Register DstReg = MI.getOperand(0).getReg();
2292   Register PtrReg = MI.getOperand(1).getReg();
2293   Register CmpVal = MI.getOperand(2).getReg();
2294   Register NewVal = MI.getOperand(3).getReg();
2295 
2296   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2297          "this should not have been custom lowered");
2298 
2299   LLT ValTy = MRI.getType(CmpVal);
2300   LLT VecTy = LLT::vector(2, ValTy);
2301 
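  // The target cmpxchg pseudo takes the new value and the compare value packed
  // into a single wide register operand, new value in the low element,
  // matching the hardware cmpswap data operand layout.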
2302   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2303 
2304   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2305     .addDef(DstReg)
2306     .addUse(PtrReg)
2307     .addUse(PackedVal)
2308     .setMemRefs(MI.memoperands());
2309 
2310   MI.eraseFromParent();
2311   return true;
2312 }
2313 
2314 bool AMDGPULegalizerInfo::legalizeFlog(
2315   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2316   Register Dst = MI.getOperand(0).getReg();
2317   Register Src = MI.getOperand(1).getReg();
2318   LLT Ty = B.getMRI()->getType(Dst);
2319   unsigned Flags = MI.getFlags();
2320 
2321   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2322   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2323 
2324   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2325   MI.eraseFromParent();
2326   return true;
2327 }
2328 
2329 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2330                                        MachineIRBuilder &B) const {
2331   Register Dst = MI.getOperand(0).getReg();
2332   Register Src = MI.getOperand(1).getReg();
2333   unsigned Flags = MI.getFlags();
2334   LLT Ty = B.getMRI()->getType(Dst);
2335 
2336   auto K = B.buildFConstant(Ty, numbers::log2e);
2337   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2338   B.buildFExp2(Dst, Mul, Flags);
2339   MI.eraseFromParent();
2340   return true;
2341 }
2342 
2343 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2344                                        MachineIRBuilder &B) const {
2345   Register Dst = MI.getOperand(0).getReg();
2346   Register Src0 = MI.getOperand(1).getReg();
2347   Register Src1 = MI.getOperand(2).getReg();
2348   unsigned Flags = MI.getFlags();
2349   LLT Ty = B.getMRI()->getType(Dst);
2350   const LLT S16 = LLT::scalar(16);
2351   const LLT S32 = LLT::scalar(32);
2352 
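  // Expand pow(x, y) as exp2(y * log2(x)). The multiply uses fmul_legacy
  // (where 0 * anything == 0), presumably so a zero exponent still yields
  // exp2(0) == 1 even when log2(x) is infinite or NaN.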
2353   if (Ty == S32) {
2354     auto Log = B.buildFLog2(S32, Src0, Flags);
2355     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2356       .addUse(Log.getReg(0))
2357       .addUse(Src1)
2358       .setMIFlags(Flags);
2359     B.buildFExp2(Dst, Mul, Flags);
2360   } else if (Ty == S16) {
2361     // There's no f16 fmul_legacy, so we need to convert for it.
2362     auto Log = B.buildFLog2(S16, Src0, Flags);
2363     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2364     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2365     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2366       .addUse(Ext0.getReg(0))
2367       .addUse(Ext1.getReg(0))
2368       .setMIFlags(Flags);
2369 
2370     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2371   } else
2372     return false;
2373 
2374   MI.eraseFromParent();
2375   return true;
2376 }
2377 
2378 // Find a source register, ignoring any possible source modifiers.
2379 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2380   Register ModSrc = OrigSrc;
2381   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2382     ModSrc = SrcFNeg->getOperand(1).getReg();
2383     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2384       ModSrc = SrcFAbs->getOperand(1).getReg();
2385   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2386     ModSrc = SrcFAbs->getOperand(1).getReg();
2387   return ModSrc;
2388 }
2389 
2390 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2391                                          MachineRegisterInfo &MRI,
2392                                          MachineIRBuilder &B) const {
2393 
2394   const LLT S1 = LLT::scalar(1);
2395   const LLT S64 = LLT::scalar(64);
2396   Register Dst = MI.getOperand(0).getReg();
2397   Register OrigSrc = MI.getOperand(1).getReg();
2398   unsigned Flags = MI.getFlags();
2399   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2400          "this should not have been custom lowered");
2401 
2402   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2403   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2404   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2405   // V_FRACT bug is:
2406   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2407   //
2408   // Convert floor(x) to (x - fract(x))
2409 
2410   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2411     .addUse(OrigSrc)
2412     .setMIFlags(Flags);
2413 
2414   // Give source modifier matching some assistance before obscuring a foldable
2415   // pattern.
2416 
2417   // TODO: We can avoid the neg on the fract? The input sign to fract
2418   // shouldn't matter?
2419   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2420 
2421   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2422 
2423   Register Min = MRI.createGenericVirtualRegister(S64);
2424 
2425   // We don't need to concern ourselves with the snan handling difference, so
2426   // use the one which will directly select.
2427   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2428   if (MFI->getMode().IEEE)
2429     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2430   else
2431     B.buildFMinNum(Min, Fract, Const, Flags);
2432 
2433   Register CorrectedFract = Min;
2434   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2435     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2436     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2437   }
2438 
2439   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2440   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2441 
2442   MI.eraseFromParent();
2443   return true;
2444 }
2445 
2446 // Turn an illegal packed v2s16 build vector into bit operations.
2447 // TODO: This should probably be a bitcast action in LegalizerHelper.
2448 bool AMDGPULegalizerInfo::legalizeBuildVector(
2449   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2450   Register Dst = MI.getOperand(0).getReg();
2451   const LLT S32 = LLT::scalar(32);
2452   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2453 
2454   Register Src0 = MI.getOperand(1).getReg();
2455   Register Src1 = MI.getOperand(2).getReg();
2456   assert(MRI.getType(Src0) == LLT::scalar(16));
2457 
2458   auto Merge = B.buildMerge(S32, {Src0, Src1});
2459   B.buildBitcast(Dst, Merge);
2460 
2461   MI.eraseFromParent();
2462   return true;
2463 }
2464 
2465 // Return the use branch instruction, otherwise null if the usage is invalid.
2466 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2467                                        MachineRegisterInfo &MRI,
2468                                        MachineInstr *&Br,
2469                                        MachineBasicBlock *&UncondBrTarget) {
2470   Register CondDef = MI.getOperand(0).getReg();
2471   if (!MRI.hasOneNonDBGUse(CondDef))
2472     return nullptr;
2473 
2474   MachineBasicBlock *Parent = MI.getParent();
2475   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2476   if (UseMI.getParent() != Parent ||
2477       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2478     return nullptr;
2479 
2480   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2481   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2482   if (Next == Parent->end()) {
2483     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2484     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2485       return nullptr;
2486     UncondBrTarget = &*NextMBB;
2487   } else {
2488     if (Next->getOpcode() != AMDGPU::G_BR)
2489       return nullptr;
2490     Br = &*Next;
2491     UncondBrTarget = Br->getOperand(0).getMBB();
2492   }
2493 
2494   return &UseMI;
2495 }
2496 
2497 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2498                                          const ArgDescriptor *Arg,
2499                                          const TargetRegisterClass *ArgRC,
2500                                          LLT ArgTy) const {
2501   MCRegister SrcReg = Arg->getRegister();
2502   assert(SrcReg.isPhysical() && "Physical register expected");
2503   assert(DstReg.isVirtual() && "Virtual register expected");
2504 
2505   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2506                                              ArgTy);
2507   if (Arg->isMasked()) {
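    // A masked argument shares its physical register with other preloaded
    // values; the mask describes which bit-field belongs to this argument, so
    // shift it down and mask off the rest.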
2508     // TODO: Should we try to emit this once in the entry block?
2509     const LLT S32 = LLT::scalar(32);
2510     const unsigned Mask = Arg->getMask();
2511     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2512 
2513     Register AndMaskSrc = LiveIn;
2514 
2515     if (Shift != 0) {
2516       auto ShiftAmt = B.buildConstant(S32, Shift);
2517       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2518     }
2519 
2520     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2521   } else {
2522     B.buildCopy(DstReg, LiveIn);
2523   }
2524 
2525   return true;
2526 }
2527 
2528 bool AMDGPULegalizerInfo::loadInputValue(
2529     Register DstReg, MachineIRBuilder &B,
2530     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2531   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2532   const ArgDescriptor *Arg;
2533   const TargetRegisterClass *ArgRC;
2534   LLT ArgTy;
2535   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2536 
2537   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2538     return false; // TODO: Handle these
2539   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2540 }
2541 
2542 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2543     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2544     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2545   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2546     return false;
2547 
2548   MI.eraseFromParent();
2549   return true;
2550 }
2551 
2552 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2553                                        MachineRegisterInfo &MRI,
2554                                        MachineIRBuilder &B) const {
2555   Register Dst = MI.getOperand(0).getReg();
2556   LLT DstTy = MRI.getType(Dst);
2557   LLT S16 = LLT::scalar(16);
2558   LLT S32 = LLT::scalar(32);
2559   LLT S64 = LLT::scalar(64);
2560 
2561   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2562     return true;
2563 
2564   if (DstTy == S16)
2565     return legalizeFDIV16(MI, MRI, B);
2566   if (DstTy == S32)
2567     return legalizeFDIV32(MI, MRI, B);
2568   if (DstTy == S64)
2569     return legalizeFDIV64(MI, MRI, B);
2570 
2571   return false;
2572 }
2573 
2574 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2575                                                   Register DstReg,
2576                                                   Register X,
2577                                                   Register Y,
2578                                                   bool IsDiv) const {
2579   const LLT S1 = LLT::scalar(1);
2580   const LLT S32 = LLT::scalar(32);
2581 
2582   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2583   // algorithm used here.
2584 
2585   // Initial estimate of inv(y).
2586   auto FloatY = B.buildUITOFP(S32, Y);
2587   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
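  // 0x4f7ffffe is 4294966784.0f, just below 2^32, so Z approximates the
  // fixed-point reciprocal floor(2^32 / y).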
2588   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2589   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2590   auto Z = B.buildFPTOUI(S32, ScaledY);
2591 
  // One round of UNR (unsigned Newton-Raphson).
2593   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2594   auto NegYZ = B.buildMul(S32, NegY, Z);
2595   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2596 
2597   // Quotient/remainder estimate.
2598   auto Q = B.buildUMulH(S32, X, Z);
2599   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2600 
2601   // First quotient/remainder refinement.
2602   auto One = B.buildConstant(S32, 1);
2603   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2604   if (IsDiv)
2605     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2606   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2607 
2608   // Second quotient/remainder refinement.
2609   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2610   if (IsDiv)
2611     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2612   else
2613     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2614 }
2615 
2616 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2617                                               MachineRegisterInfo &MRI,
2618                                               MachineIRBuilder &B) const {
2619   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2620   Register DstReg = MI.getOperand(0).getReg();
2621   Register Num = MI.getOperand(1).getReg();
2622   Register Den = MI.getOperand(2).getReg();
2623   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2624   MI.eraseFromParent();
2625   return true;
2626 }
2627 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2629 //
2630 // Return lo, hi of result
2631 //
2632 // %cvt.lo = G_UITOFP Val.lo
2633 // %cvt.hi = G_UITOFP Val.hi
2634 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2635 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2636 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2637 // %mul2 = G_FMUL %mul1, 2**(-32)
2638 // %trunc = G_INTRINSIC_TRUNC %mul2
2639 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2640 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2641 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2642                                                        Register Val) {
2643   const LLT S32 = LLT::scalar(32);
2644   auto Unmerge = B.buildUnmerge(S32, Val);
2645 
2646   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2647   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2648 
2649   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2650                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2651 
2652   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2653   auto Mul1 =
2654       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2655 
2656   // 2**(-32)
2657   auto Mul2 =
2658       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2659   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2660 
2661   // -(2**32)
2662   auto Mad2 = B.buildFMAD(S32, Trunc,
2663                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2664 
2665   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2666   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2667 
2668   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2669 }
2670 
2671 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2672                                                   Register DstReg,
2673                                                   Register Numer,
2674                                                   Register Denom,
2675                                                   bool IsDiv) const {
2676   const LLT S32 = LLT::scalar(32);
2677   const LLT S64 = LLT::scalar(64);
2678   const LLT S1 = LLT::scalar(1);
2679   Register RcpLo, RcpHi;
2680 
2681   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
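  // The 64-bit expansion follows the same pattern as the 32-bit one: take a
  // fixed-point reciprocal of the denominator, refine it with two rounds of
  // Newton-Raphson performed on 32-bit halves with explicit carries, form a
  // quotient estimate with a 64-bit umulh, then correct the quotient and
  // remainder with up to two conditional subtractions of the denominator.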
2682 
2683   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2684 
2685   auto Zero64 = B.buildConstant(S64, 0);
2686   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2687 
2688   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2689   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2690 
2691   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2692   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2693   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2694 
2695   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2696   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2697   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2698   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2699 
2700   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2701   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2702   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2703   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2704   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2705 
2706   auto Zero32 = B.buildConstant(S32, 0);
2707   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2708   auto Add2_HiC =
2709       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2710   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2711   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2712 
2713   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2714   Register NumerLo = UnmergeNumer.getReg(0);
2715   Register NumerHi = UnmergeNumer.getReg(1);
2716 
2717   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2718   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2719   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2720   Register Mul3_Lo = UnmergeMul3.getReg(0);
2721   Register Mul3_Hi = UnmergeMul3.getReg(1);
2722   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2723   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2724   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2725   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2726 
2727   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2728   Register DenomLo = UnmergeDenom.getReg(0);
2729   Register DenomHi = UnmergeDenom.getReg(1);
2730 
2731   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2732   auto C1 = B.buildSExt(S32, CmpHi);
2733 
2734   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2735   auto C2 = B.buildSExt(S32, CmpLo);
2736 
2737   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2738   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2739 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently control flow is unconditional and we have 4 selects
  // after the potential endif to substitute for PHIs.
2743 
2744   // if C3 != 0 ...
2745   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2746   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2747   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2748   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2749 
2750   auto One64 = B.buildConstant(S64, 1);
2751   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2752 
2753   auto C4 =
2754       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2755   auto C5 =
2756       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2757   auto C6 = B.buildSelect(
2758       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2759 
2760   // if (C6 != 0)
2761   auto Add4 = B.buildAdd(S64, Add3, One64);
2762   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2763 
2764   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2765   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2766   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2767 
2768   // endif C6
2769   // endif C3
2770 
2771   if (IsDiv) {
2772     auto Sel1 = B.buildSelect(
2773         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2774     B.buildSelect(DstReg,
2775                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2776   } else {
2777     auto Sel2 = B.buildSelect(
2778         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2779     B.buildSelect(DstReg,
2780                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2781   }
2782 }
2783 
2784 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2785                                             MachineRegisterInfo &MRI,
2786                                             MachineIRBuilder &B) const {
2787   const LLT S64 = LLT::scalar(64);
2788   const LLT S32 = LLT::scalar(32);
2789   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2790   Register DstReg = MI.getOperand(0).getReg();
2791   Register Num = MI.getOperand(1).getReg();
2792   Register Den = MI.getOperand(2).getReg();
2793   LLT Ty = MRI.getType(DstReg);
2794 
2795   if (Ty == S32)
2796     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2797   else if (Ty == S64)
2798     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2799   else
2800     return false;
2801 
2802   MI.eraseFromParent();
  return true;
}
2806 
2807 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2808                                             MachineRegisterInfo &MRI,
2809                                             MachineIRBuilder &B) const {
2810   const LLT S64 = LLT::scalar(64);
2811   const LLT S32 = LLT::scalar(32);
2812 
2813   Register DstReg = MI.getOperand(0).getReg();
2814   const LLT Ty = MRI.getType(DstReg);
2815   if (Ty != S32 && Ty != S64)
2816     return false;
2817 
2818   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2819 
2820   Register LHS = MI.getOperand(1).getReg();
2821   Register RHS = MI.getOperand(2).getReg();
2822 
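  // Compute |LHS| and |RHS| as (x + sign) ^ sign, where sign is x arithmetic
  // shifted to all-ones or all-zeros; do the unsigned division on the
  // magnitudes, then apply the result sign the same way. The quotient's sign
  // is sign(LHS) ^ sign(RHS); the remainder takes the sign of LHS.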
2823   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2824   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2825   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2826 
2827   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2828   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2829 
2830   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2831   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2832 
2833   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2834   if (Ty == S32)
2835     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2836   else
2837     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2838 
2839   Register Sign;
2840   if (IsDiv)
2841     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2842   else
2843     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2844 
2845   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2846   B.buildSub(DstReg, UDivRem, Sign);
2847 
2848   MI.eraseFromParent();
2849   return true;
2850 }
2851 
2852 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2853                                                  MachineRegisterInfo &MRI,
2854                                                  MachineIRBuilder &B) const {
2855   Register Res = MI.getOperand(0).getReg();
2856   Register LHS = MI.getOperand(1).getReg();
2857   Register RHS = MI.getOperand(2).getReg();
2858 
2859   uint16_t Flags = MI.getFlags();
2860 
2861   LLT ResTy = MRI.getType(Res);
2862   LLT S32 = LLT::scalar(32);
2863   LLT S64 = LLT::scalar(64);
2864 
2865   const MachineFunction &MF = B.getMF();
2866   bool Unsafe =
2867     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2868 
2869   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2870     return false;
2871 
2872   if (!Unsafe && ResTy == S32 &&
2873       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2874     return false;
2875 
2876   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2877     // 1 / x -> RCP(x)
2878     if (CLHS->isExactlyValue(1.0)) {
2879       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2880         .addUse(RHS)
2881         .setMIFlags(Flags);
2882 
2883       MI.eraseFromParent();
2884       return true;
2885     }
2886 
2887     // -1 / x -> RCP( FNEG(x) )
2888     if (CLHS->isExactlyValue(-1.0)) {
2889       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2890       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2891         .addUse(FNeg.getReg(0))
2892         .setMIFlags(Flags);
2893 
2894       MI.eraseFromParent();
2895       return true;
2896     }
2897   }
2898 
2899   // x / y -> x * (1.0 / y)
2900   if (Unsafe) {
2901     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2902       .addUse(RHS)
2903       .setMIFlags(Flags);
2904     B.buildFMul(Res, LHS, RCP, Flags);
2905 
2906     MI.eraseFromParent();
2907     return true;
2908   }
2909 
2910   return false;
2911 }
2912 
2913 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2914                                          MachineRegisterInfo &MRI,
2915                                          MachineIRBuilder &B) const {
2916   Register Res = MI.getOperand(0).getReg();
2917   Register LHS = MI.getOperand(1).getReg();
2918   Register RHS = MI.getOperand(2).getReg();
2919 
2920   uint16_t Flags = MI.getFlags();
2921 
2922   LLT S16 = LLT::scalar(16);
2923   LLT S32 = LLT::scalar(32);
2924 
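  // The f16 division is carried out in f32: extend both operands, take the
  // approximate reciprocal, multiply, truncate back to f16, and let
  // div_fixup patch up the special-case inputs.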
2925   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2926   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2927 
2928   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2929     .addUse(RHSExt.getReg(0))
2930     .setMIFlags(Flags);
2931 
2932   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2933   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2934 
2935   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2936     .addUse(RDst.getReg(0))
2937     .addUse(RHS)
2938     .addUse(LHS)
2939     .setMIFlags(Flags);
2940 
2941   MI.eraseFromParent();
2942   return true;
2943 }
2944 
2945 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit
2946 // instructions to enable denorm mode; otherwise, emit instructions to disable it.
2947 static void toggleSPDenormMode(bool Enable,
2948                                MachineIRBuilder &B,
2949                                const GCNSubtarget &ST,
2950                                AMDGPU::SIModeRegisterDefaults Mode) {
2951   // Set SP denorm mode to this value.
2952   unsigned SPDenormMode =
2953     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2954 
2955   if (ST.hasDenormModeInst()) {
2956     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2957     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2958 
2959     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2960     B.buildInstr(AMDGPU::S_DENORM_MODE)
2961       .addImm(NewDenormModeValue);
2962 
2963   } else {
2964     // Select FP32 bit field in mode register.
2965     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2966                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2967                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2968 
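    // This selects a 2-bit field at offset 4 of the MODE register, i.e. the
    // FP32 denorm control bits (MODE[5:4]); s_setreg only updates the selected
    // field and leaves the rest of MODE untouched.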
2969     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2970       .addImm(SPDenormMode)
2971       .addImm(SPDenormModeBitField);
2972   }
2973 }
2974 
2975 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2976                                          MachineRegisterInfo &MRI,
2977                                          MachineIRBuilder &B) const {
2978   Register Res = MI.getOperand(0).getReg();
2979   Register LHS = MI.getOperand(1).getReg();
2980   Register RHS = MI.getOperand(2).getReg();
2981   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2982   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2983 
2984   uint16_t Flags = MI.getFlags();
2985 
2986   LLT S32 = LLT::scalar(32);
2987   LLT S1 = LLT::scalar(1);
2988 
2989   auto One = B.buildFConstant(S32, 1.0f);
2990 
2991   auto DenominatorScaled =
2992     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2993       .addUse(LHS)
2994       .addUse(RHS)
2995       .addImm(0)
2996       .setMIFlags(Flags);
2997   auto NumeratorScaled =
2998     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2999       .addUse(LHS)
3000       .addUse(RHS)
3001       .addImm(1)
3002       .setMIFlags(Flags);
3003 
3004   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3005     .addUse(DenominatorScaled.getReg(0))
3006     .setMIFlags(Flags);
3007   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3008 
3009   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3010   // aren't modeled as reading it.
3011   if (!Mode.allFP32Denormals())
3012     toggleSPDenormMode(true, B, ST, Mode);
3013 
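  // Refine the approximate reciprocal with a Newton-Raphson step:
  //   Fma0 = 1 - d * rcp(d)         (error term)
  //   Fma1 = rcp(d) + rcp(d) * Fma0 (refined 1/d)
  // then form a quotient estimate and two residual corrections that feed
  // div_fmas/div_fixup to produce the final result.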
3014   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3015   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3016   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3017   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3018   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3019   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3020 
3021   if (!Mode.allFP32Denormals())
3022     toggleSPDenormMode(false, B, ST, Mode);
3023 
3024   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3025     .addUse(Fma4.getReg(0))
3026     .addUse(Fma1.getReg(0))
3027     .addUse(Fma3.getReg(0))
3028     .addUse(NumeratorScaled.getReg(1))
3029     .setMIFlags(Flags);
3030 
3031   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3032     .addUse(Fmas.getReg(0))
3033     .addUse(RHS)
3034     .addUse(LHS)
3035     .setMIFlags(Flags);
3036 
3037   MI.eraseFromParent();
3038   return true;
3039 }
3040 
3041 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3042                                          MachineRegisterInfo &MRI,
3043                                          MachineIRBuilder &B) const {
3044   Register Res = MI.getOperand(0).getReg();
3045   Register LHS = MI.getOperand(1).getReg();
3046   Register RHS = MI.getOperand(2).getReg();
3047 
3048   uint16_t Flags = MI.getFlags();
3049 
3050   LLT S64 = LLT::scalar(64);
3051   LLT S1 = LLT::scalar(1);
3052 
3053   auto One = B.buildFConstant(S64, 1.0);
3054 
3055   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3056     .addUse(LHS)
3057     .addUse(RHS)
3058     .addImm(0)
3059     .setMIFlags(Flags);
3060 
3061   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3062 
3063   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3064     .addUse(DivScale0.getReg(0))
3065     .setMIFlags(Flags);
3066 
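  // Same Newton-Raphson style refinement as the f32 path, carried out in
  // double precision, with separate div_scale results for the denominator
  // (DivScale0) and the numerator (DivScale1).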
3067   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3068   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3069   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3070 
3071   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3072     .addUse(LHS)
3073     .addUse(RHS)
3074     .addImm(1)
3075     .setMIFlags(Flags);
3076 
3077   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3078   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3079   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3080 
3081   Register Scale;
3082   if (!ST.hasUsableDivScaleConditionOutput()) {
3083     // Workaround a hardware bug on SI where the condition output from div_scale
3084     // is not usable.
3085 
3086     LLT S32 = LLT::scalar(32);
3087 
3088     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3089     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3090     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3091     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3092 
3093     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3094                               Scale1Unmerge.getReg(1));
3095     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3096                               Scale0Unmerge.getReg(1));
3097     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3098   } else {
3099     Scale = DivScale1.getReg(1);
3100   }
3101 
3102   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3103     .addUse(Fma4.getReg(0))
3104     .addUse(Fma3.getReg(0))
3105     .addUse(Mul.getReg(0))
3106     .addUse(Scale)
3107     .setMIFlags(Flags);
3108 
3109   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3110     .addUse(Fmas.getReg(0))
3111     .addUse(RHS)
3112     .addUse(LHS)
3113     .setMIFlags(Flags);
3114 
3115   MI.eraseFromParent();
3116   return true;
3117 }
3118 
3119 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3120                                                  MachineRegisterInfo &MRI,
3121                                                  MachineIRBuilder &B) const {
3122   Register Res = MI.getOperand(0).getReg();
3123   Register LHS = MI.getOperand(2).getReg();
3124   Register RHS = MI.getOperand(3).getReg();
3125   uint16_t Flags = MI.getFlags();
3126 
3127   LLT S32 = LLT::scalar(32);
3128   LLT S1 = LLT::scalar(1);
3129 
3130   auto Abs = B.buildFAbs(S32, RHS, Flags);
3131   const APFloat C0Val(1.0f);
3132 
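  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. If |RHS|
  // exceeds 2^96, pre-scale the denominator by 2^-32 before the reciprocal and
  // apply the same scale to the final product; the scale cancels out and keeps
  // the intermediate values in range.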
3133   auto C0 = B.buildConstant(S32, 0x6f800000);
3134   auto C1 = B.buildConstant(S32, 0x2f800000);
3135   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3136 
3137   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3138   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3139 
3140   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3141 
3142   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3143     .addUse(Mul0.getReg(0))
3144     .setMIFlags(Flags);
3145 
3146   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3147 
3148   B.buildFMul(Res, Sel, Mul1, Flags);
3149 
3150   MI.eraseFromParent();
3151   return true;
3152 }
3153 
3154 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3155                                             MachineRegisterInfo &MRI,
3156                                             MachineIRBuilder &B) const {
3157   uint64_t Offset =
3158     ST.getTargetLowering()->getImplicitParameterOffset(
3159       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3160   LLT DstTy = MRI.getType(DstReg);
3161   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3162 
3163   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3164   if (!loadInputValue(KernargPtrReg, B,
3165                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3166     return false;
3167 
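  // The implicit arguments are laid out immediately after the explicit kernel
  // arguments, so the implicit argument pointer is just the kernarg segment
  // pointer offset by that amount.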
3168   // FIXME: This should be nuw
3169   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3170   return true;
3171 }
3172 
3173 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3174                                                  MachineRegisterInfo &MRI,
3175                                                  MachineIRBuilder &B) const {
3176   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3177   if (!MFI->isEntryFunction()) {
3178     return legalizePreloadedArgIntrin(MI, MRI, B,
3179                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3180   }
3181 
3182   Register DstReg = MI.getOperand(0).getReg();
3183   if (!getImplicitArgPtr(DstReg, MRI, B))
3184     return false;
3185 
3186   MI.eraseFromParent();
3187   return true;
3188 }
3189 
3190 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3191                                               MachineRegisterInfo &MRI,
3192                                               MachineIRBuilder &B,
3193                                               unsigned AddrSpace) const {
3194   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
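  // A 64-bit flat pointer is in the queried segment iff the high 32 bits of
  // its address equal that segment's aperture base.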
3195   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3196   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3197   MI.eraseFromParent();
3198   return true;
3199 }
3200 
3201 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3202 // offset (the offset that is included in bounds checking and swizzling, to be
3203 // split between the instruction's voffset and immoffset fields) and soffset
3204 // (the offset that is excluded from bounds checking and swizzling, to go in
3205 // the instruction's soffset field).  This function takes the first kind of
3206 // offset and figures out how to split it between voffset and immoffset.
3207 std::tuple<Register, unsigned, unsigned>
3208 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3209                                         Register OrigOffset) const {
3210   const unsigned MaxImm = 4095;
3211   Register BaseReg;
3212   unsigned TotalConstOffset;
3213   MachineInstr *OffsetDef;
3214   const LLT S32 = LLT::scalar(32);
3215 
3216   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3217     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3218 
3219   unsigned ImmOffset = TotalConstOffset;
3220 
3221   // If the immediate value is too big for the immoffset field, keep only its
3222   // low 12 bits (value & 4095) there, so that the value that is copied/added
3223   // for the voffset field is a multiple of 4096 and stands a better chance of
3224   // being CSEd with the copy/add for another similar load/store.
3225   // However, do not do that rounding down to a multiple of 4096 if the result
3226   // would be negative, as it appears to be illegal to have a negative offset
3227   // in the vgpr, even if adding the immediate offset makes it positive.
3228   unsigned Overflow = ImmOffset & ~MaxImm;
3229   ImmOffset -= Overflow;
3230   if ((int32_t)Overflow < 0) {
3231     Overflow += ImmOffset;
3232     ImmOffset = 0;
3233   }
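  // For example, an incoming constant offset of 4100 ends up as ImmOffset = 4
  // with Overflow = 4096; the multiple of 4096 is folded into the voffset add
  // below, keeping the immediate within [0, MaxImm].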
3234 
3235   if (Overflow != 0) {
3236     if (!BaseReg) {
3237       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3238     } else {
3239       auto OverflowVal = B.buildConstant(S32, Overflow);
3240       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3241     }
3242   }
3243 
3244   if (!BaseReg)
3245     BaseReg = B.buildConstant(S32, 0).getReg(0);
3246 
3247   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3248 }
3249 
3250 /// Handle register layout difference for f16 images for some subtargets.
3251 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3252                                              MachineRegisterInfo &MRI,
3253                                              Register Reg) const {
3254   if (!ST.hasUnpackedD16VMem())
3255     return Reg;
3256 
3257   const LLT S16 = LLT::scalar(16);
3258   const LLT S32 = LLT::scalar(32);
3259   LLT StoreVT = MRI.getType(Reg);
3260   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3261 
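  // With unpacked D16, each 16-bit element occupies its own 32-bit register,
  // so e.g. a <4 x s16> store source becomes a <4 x s32> with the values
  // any-extended into the low halves.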
3262   auto Unmerge = B.buildUnmerge(S16, Reg);
3263 
3264   SmallVector<Register, 4> WideRegs;
3265   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3266     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3267 
3268   int NumElts = StoreVT.getNumElements();
3269 
3270   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3271 }
3272 
3273 Register AMDGPULegalizerInfo::fixStoreSourceType(
3274   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3275   MachineRegisterInfo *MRI = B.getMRI();
3276   LLT Ty = MRI->getType(VData);
3277 
3278   const LLT S16 = LLT::scalar(16);
3279 
3280   // Fixup illegal register types for i8 stores.
3281   if (Ty == LLT::scalar(8) || Ty == S16) {
3282     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3283     return AnyExt;
3284   }
3285 
3286   if (Ty.isVector()) {
3287     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3288       if (IsFormat)
3289         return handleD16VData(B, *MRI, VData);
3290     }
3291   }
3292 
3293   return VData;
3294 }
3295 
3296 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3297                                               MachineRegisterInfo &MRI,
3298                                               MachineIRBuilder &B,
3299                                               bool IsTyped,
3300                                               bool IsFormat) const {
3301   Register VData = MI.getOperand(1).getReg();
3302   LLT Ty = MRI.getType(VData);
3303   LLT EltTy = Ty.getScalarType();
3304   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3305   const LLT S32 = LLT::scalar(32);
3306 
3307   VData = fixStoreSourceType(B, VData, IsFormat);
3308   Register RSrc = MI.getOperand(2).getReg();
3309 
3310   MachineMemOperand *MMO = *MI.memoperands_begin();
3311   const int MemSize = MMO->getSize();
3312 
3313   unsigned ImmOffset;
3314   unsigned TotalOffset;
3315 
3316   // The typed intrinsics add an immediate after the registers.
3317   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3318 
3319   // The struct intrinsic variants add one additional operand over raw.
3320   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3321   Register VIndex;
3322   int OpOffset = 0;
3323   if (HasVIndex) {
3324     VIndex = MI.getOperand(3).getReg();
3325     OpOffset = 1;
3326   }
3327 
3328   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3329   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3330 
3331   unsigned Format = 0;
3332   if (IsTyped) {
3333     Format = MI.getOperand(5 + OpOffset).getImm();
3334     ++OpOffset;
3335   }
3336 
3337   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3338 
3339   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3340   if (TotalOffset != 0)
3341     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3342 
3343   unsigned Opc;
3344   if (IsTyped) {
3345     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3346                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3347   } else if (IsFormat) {
3348     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3349                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3350   } else {
3351     switch (MemSize) {
3352     case 1:
3353       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3354       break;
3355     case 2:
3356       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3357       break;
3358     default:
3359       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3360       break;
3361     }
3362   }
3363 
3364   if (!VIndex)
3365     VIndex = B.buildConstant(S32, 0).getReg(0);
3366 
3367   auto MIB = B.buildInstr(Opc)
3368     .addUse(VData)              // vdata
3369     .addUse(RSrc)               // rsrc
3370     .addUse(VIndex)             // vindex
3371     .addUse(VOffset)            // voffset
3372     .addUse(SOffset)            // soffset
3373     .addImm(ImmOffset);         // offset(imm)
3374 
3375   if (IsTyped)
3376     MIB.addImm(Format);
3377 
3378   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3379      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3380      .addMemOperand(MMO);
3381 
3382   MI.eraseFromParent();
3383   return true;
3384 }
3385 
3386 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3387                                              MachineRegisterInfo &MRI,
3388                                              MachineIRBuilder &B,
3389                                              bool IsFormat,
3390                                              bool IsTyped) const {
3391   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3392   MachineMemOperand *MMO = *MI.memoperands_begin();
3393   const int MemSize = MMO->getSize();
3394   const LLT S32 = LLT::scalar(32);
3395 
3396   Register Dst = MI.getOperand(0).getReg();
3397   Register RSrc = MI.getOperand(2).getReg();
3398 
3399   // The typed intrinsics add an immediate after the registers.
3400   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3401 
3402   // The struct intrinsic variants add one additional operand over raw.
3403   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3404   Register VIndex;
3405   int OpOffset = 0;
3406   if (HasVIndex) {
3407     VIndex = MI.getOperand(3).getReg();
3408     OpOffset = 1;
3409   }
3410 
3411   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3412   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3413 
3414   unsigned Format = 0;
3415   if (IsTyped) {
3416     Format = MI.getOperand(5 + OpOffset).getImm();
3417     ++OpOffset;
3418   }
3419 
3420   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3421   unsigned ImmOffset;
3422   unsigned TotalOffset;
3423 
3424   LLT Ty = MRI.getType(Dst);
3425   LLT EltTy = Ty.getScalarType();
3426   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3427   const bool Unpacked = ST.hasUnpackedD16VMem();
3428 
3429   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3430   if (TotalOffset != 0)
3431     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3432 
3433   unsigned Opc;
3434 
3435   if (IsTyped) {
3436     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3437                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3438   } else if (IsFormat) {
3439     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3440                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3441   } else {
3442     switch (MemSize) {
3443     case 1:
3444       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3445       break;
3446     case 2:
3447       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3448       break;
3449     default:
3450       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3451       break;
3452     }
3453   }
3454 
3455   Register LoadDstReg;
3456 
3457   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3458   LLT UnpackedTy = Ty.changeElementSize(32);
3459 
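  // Sub-dword and scalar d16 results come back widened to s32 and are
  // truncated after the instruction; packed d16 vector results on
  // unpacked-VMEM subtargets come back as one s32 per element and are
  // repacked below.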
3460   if (IsExtLoad)
3461     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3462   else if (Unpacked && IsD16 && Ty.isVector())
3463     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3464   else
3465     LoadDstReg = Dst;
3466 
3467   if (!VIndex)
3468     VIndex = B.buildConstant(S32, 0).getReg(0);
3469 
3470   auto MIB = B.buildInstr(Opc)
3471     .addDef(LoadDstReg)         // vdata
3472     .addUse(RSrc)               // rsrc
3473     .addUse(VIndex)             // vindex
3474     .addUse(VOffset)            // voffset
3475     .addUse(SOffset)            // soffset
3476     .addImm(ImmOffset);         // offset(imm)
3477 
3478   if (IsTyped)
3479     MIB.addImm(Format);
3480 
3481   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3482      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3483      .addMemOperand(MMO);
3484 
3485   if (LoadDstReg != Dst) {
3486     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3487 
3488     // The result was widened for the extending load; truncate it back down.
3489     if (IsExtLoad)
3490       B.buildTrunc(Dst, LoadDstReg);
3491     else {
3492       // Repack to original 16-bit vector result
3493       // FIXME: G_TRUNC should work, but legalization currently fails
3494       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3495       SmallVector<Register, 4> Repack;
3496       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3497         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3498       B.buildMerge(Dst, Repack);
3499     }
3500   }
3501 
3502   MI.eraseFromParent();
3503   return true;
3504 }
3505 
3506 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3507                                                MachineIRBuilder &B,
3508                                                bool IsInc) const {
3509   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3510                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3511   B.buildInstr(Opc)
3512     .addDef(MI.getOperand(0).getReg())
3513     .addUse(MI.getOperand(2).getReg())
3514     .addUse(MI.getOperand(3).getReg())
3515     .cloneMemRefs(MI);
3516   MI.eraseFromParent();
3517   return true;
3518 }
3519 
3520 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3521   switch (IntrID) {
3522   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3523   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3524     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3525   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3526   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3527     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3528   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3529   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3530     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3531   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3532   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3533     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3534   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3535   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3536     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3537   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3538   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3539     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3540   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3541   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3542     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3543   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3544   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3545     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3546   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3547   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3548     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3549   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3550   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3551     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3552   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3553   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3554     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3555   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3556   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3557     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3558   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3559   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3560     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3561   default:
3562     llvm_unreachable("unhandled atomic opcode");
3563   }
3564 }
3565 
3566 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3567                                                MachineIRBuilder &B,
3568                                                Intrinsic::ID IID) const {
3569   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3570                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3571 
3572   Register Dst = MI.getOperand(0).getReg();
3573   Register VData = MI.getOperand(2).getReg();
3574 
3575   Register CmpVal;
3576   int OpOffset = 0;
3577 
3578   if (IsCmpSwap) {
3579     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3580     ++OpOffset;
3581   }
3582 
3583   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3584   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3585 
3586   // The struct intrinsic variants add one additional operand over raw.
3587   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3588   Register VIndex;
3589   if (HasVIndex) {
3590     VIndex = MI.getOperand(4 + OpOffset).getReg();
3591     ++OpOffset;
3592   }
3593 
3594   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3595   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3596   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3597 
3598   MachineMemOperand *MMO = *MI.memoperands_begin();
3599 
3600   unsigned ImmOffset;
3601   unsigned TotalOffset;
3602   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3603   if (TotalOffset != 0)
3604     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3605 
3606   if (!VIndex)
3607     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3608 
3609   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3610     .addDef(Dst)
3611     .addUse(VData); // vdata
3612 
3613   if (IsCmpSwap)
3614     MIB.addReg(CmpVal);
3615 
3616   MIB.addUse(RSrc)               // rsrc
3617      .addUse(VIndex)             // vindex
3618      .addUse(VOffset)            // voffset
3619      .addUse(SOffset)            // soffset
3620      .addImm(ImmOffset)          // offset(imm)
3621      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3622      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3623      .addMemOperand(MMO);
3624 
3625   MI.eraseFromParent();
3626   return true;
3627 }
3628 
3629 /// Pack the s16 typed address operands of \p MI, starting at \p AddrIdx, into
3630 /// dword sized <2 x s16> registers and append them to \p PackedAddrs.
3631 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3632                                         SmallVectorImpl<Register> &PackedAddrs,
3633                                         int AddrIdx, int DimIdx, int EndIdx,
3634                                         int NumGradients) {
3635   const LLT S16 = LLT::scalar(16);
3636   const LLT V2S16 = LLT::vector(2, 16);
3637 
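  // E.g. for a 2D sample with s16 coordinates, u and v end up packed into a
  // single <2 x s16> register; an odd trailing coordinate (or an eliminated
  // lod) is paired with undef instead.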
3638   for (int I = AddrIdx; I < EndIdx; ++I) {
3639     MachineOperand &SrcOp = MI.getOperand(I);
3640     if (!SrcOp.isReg())
3641       continue; // _L to _LZ may have eliminated this.
3642 
3643     Register AddrReg = SrcOp.getReg();
3644 
3645     if (I < DimIdx) {
3646       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3647       PackedAddrs.push_back(AddrReg);
3648     } else {
3649       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3650       // derivatives dx/dh and dx/dv are packed with undef.
3651       if (((I + 1) >= EndIdx) ||
3652           ((NumGradients / 2) % 2 == 1 &&
3653            (I == DimIdx + (NumGradients / 2) - 1 ||
3654             I == DimIdx + NumGradients - 1)) ||
3655           // Check for _L to _LZ optimization
3656           !MI.getOperand(I + 1).isReg()) {
3657         PackedAddrs.push_back(
3658             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3659                 .getReg(0));
3660       } else {
3661         PackedAddrs.push_back(
3662             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3663                 .getReg(0));
3664         ++I;
3665       }
3666     }
3667   }
3668 }
3669 
3670 /// Convert from separate vaddr components to a single vector address register,
3671 /// and replace the remaining operands with $noreg.
3672 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3673                                      int DimIdx, int NumVAddrs) {
3674   const LLT S32 = LLT::scalar(32);
3675 
3676   SmallVector<Register, 8> AddrRegs;
3677   for (int I = 0; I != NumVAddrs; ++I) {
3678     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3679     if (SrcOp.isReg()) {
3680       AddrRegs.push_back(SrcOp.getReg());
3681       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3682     }
3683   }
3684 
3685   int NumAddrRegs = AddrRegs.size();
3686   if (NumAddrRegs != 1) {
3687     // Round up to 8 elements for v5-v7
3688     // FIXME: Missing intermediate sized register classes and instructions.
3689     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3690       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3691       auto Undef = B.buildUndef(S32);
3692       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3693       NumAddrRegs = RoundedNumRegs;
3694     }
3695 
3696     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3697     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3698   }
3699 
3700   for (int I = 1; I != NumVAddrs; ++I) {
3701     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3702     if (SrcOp.isReg())
3703       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3704   }
3705 }
3706 
3707 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3708 ///
3709 /// Depending on the subtarget, load/store with 16-bit element data need to be
3710 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3711 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3712 /// registers.
3713 ///
3714 /// We don't want to directly select image instructions just yet, but we also
3715 /// want to expose all register repacking to the legalizer/combiners. We also
3716 /// don't want a selected instruction entering RegBankSelect. In order to avoid
3717 /// defining a multitude of intermediate image instructions, directly hack on
3718 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3719 /// the now unnecessary arguments with $noreg.
3720 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3721     MachineInstr &MI, MachineIRBuilder &B,
3722     GISelChangeObserver &Observer,
3723     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3724 
3725   const int NumDefs = MI.getNumExplicitDefs();
3726   bool IsTFE = NumDefs == 2;
3727   // We are only processing the operands of d16 image operations on subtargets
3728   // that use the unpacked register layout, or need to repack the TFE result.
3729 
3730   // TODO: Do we need to guard against already legalized intrinsics?
3731   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3732     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3733 
3734   MachineRegisterInfo *MRI = B.getMRI();
3735   const LLT S32 = LLT::scalar(32);
3736   const LLT S16 = LLT::scalar(16);
3737   const LLT V2S16 = LLT::vector(2, 16);
3738 
3739   // Index of first address argument
3740   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3741 
3742   int NumVAddrs, NumGradients;
3743   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3744   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3745     getDMaskIdx(BaseOpcode, NumDefs);
3746   unsigned DMask = 0;
3747 
3748   // Check for 16 bit addresses and pack if true.
3749   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3750   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3751   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3752   const bool IsG16 = GradTy == S16;
3753   const bool IsA16 = AddrTy == S16;
3754 
3755   int DMaskLanes = 0;
3756   if (!BaseOpcode->Atomic) {
3757     DMask = MI.getOperand(DMaskIdx).getImm();
3758     if (BaseOpcode->Gather4) {
3759       DMaskLanes = 4;
3760     } else if (DMask != 0) {
3761       DMaskLanes = countPopulation(DMask);
3762     } else if (!IsTFE && !BaseOpcode->Store) {
3763       // If dmask is 0, this is a no-op load. This can be eliminated.
3764       B.buildUndef(MI.getOperand(0));
3765       MI.eraseFromParent();
3766       return true;
3767     }
3768   }
3769 
3770   Observer.changingInstr(MI);
3771   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3772 
3773   unsigned NewOpcode = NumDefs == 0 ?
3774     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3775 
3776   // Track that we legalized this
3777   MI.setDesc(B.getTII().get(NewOpcode));
3778 
3779   // We expect to get an error flag since TFE is on and dmask is 0. Force
3780   // dmask to be at least 1, otherwise the instruction will fail.
3781   if (IsTFE && DMask == 0) {
3782     DMask = 0x1;
3783     DMaskLanes = 1;
3784     MI.getOperand(DMaskIdx).setImm(DMask);
3785   }
3786 
3787   if (BaseOpcode->Atomic) {
3788     Register VData0 = MI.getOperand(2).getReg();
3789     LLT Ty = MRI->getType(VData0);
3790 
3791     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3792     if (Ty.isVector())
3793       return false;
3794 
3795     if (BaseOpcode->AtomicX2) {
3796       Register VData1 = MI.getOperand(3).getReg();
3797       // The two values are packed in one register.
3798       LLT PackedTy = LLT::vector(2, Ty);
3799       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3800       MI.getOperand(2).setReg(Concat.getReg(0));
3801       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3802     }
3803   }
3804 
3805   int CorrectedNumVAddrs = NumVAddrs;
3806 
3807   // Optimize _L to _LZ when _L is zero
3808   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3809         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3810     const ConstantFP *ConstantLod;
3811     const int LodIdx = AddrIdx + NumVAddrs - 1;
3812 
3813     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3814       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3815         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3816         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3817           LZMappingInfo->LZ, ImageDimIntr->Dim);
3818 
3819         // The starting indexes should remain in the same place.
3820         --NumVAddrs;
3821         --CorrectedNumVAddrs;
3822 
3823         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3824           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3825         MI.RemoveOperand(LodIdx);
3826       }
3827     }
3828   }
3829 
3830   // Optimize _mip away, when 'lod' is zero
3831   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3832     int64_t ConstantLod;
3833     const int LodIdx = AddrIdx + NumVAddrs - 1;
3834 
3835     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3836       if (ConstantLod == 0) {
3837         // TODO: Change intrinsic opcode and remove operand instead of replacing
3838         // it with 0, as the _L to _LZ handling is done above.
3839         MI.getOperand(LodIdx).ChangeToImmediate(0);
3840         --CorrectedNumVAddrs;
3841       }
3842     }
3843   }
3844 
3845   // Rewrite the addressing register layout before doing anything else.
3846   if (IsA16 || IsG16) {
3847     if (IsA16) {
3848       // Target must support the feature and gradients need to be 16 bit too
3849       if (!ST.hasA16() || !IsG16)
3850         return false;
3851     } else if (!ST.hasG16())
3852       return false;
3853 
3854     if (NumVAddrs > 1) {
3855       SmallVector<Register, 4> PackedRegs;
3856       // Don't compress addresses for G16
3857       const int PackEndIdx =
3858           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3859       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3860                                   PackEndIdx, NumGradients);
3861 
3862       if (!IsA16) {
3863         // Add uncompressed address
3864         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3865           Register AddrReg = MI.getOperand(I).getReg();
3866           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3867           PackedRegs.push_back(AddrReg);
3868         }
3869       }
3870 
3871       // See also below in the non-a16 branch
3872       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3873 
3874       if (!UseNSA && PackedRegs.size() > 1) {
3875         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3876         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3877         PackedRegs[0] = Concat.getReg(0);
3878         PackedRegs.resize(1);
3879       }
3880 
3881       const int NumPacked = PackedRegs.size();
3882       for (int I = 0; I != NumVAddrs; ++I) {
3883         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3884         if (!SrcOp.isReg()) {
3885           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3886           continue;
3887         }
3888 
3889         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3890 
3891         if (I < NumPacked)
3892           SrcOp.setReg(PackedRegs[I]);
3893         else
3894           SrcOp.setReg(AMDGPU::NoRegister);
3895       }
3896     }
3897   } else {
3898     // If the register allocator cannot place the address registers contiguously
3899     // without introducing moves, then using the non-sequential address encoding
3900     // is always preferable, since it saves VALU instructions and is usually a
3901     // wash in terms of code size or even better.
3902     //
3903     // However, we currently have no way of hinting to the register allocator
3904     // that MIMG addresses should be placed contiguously when it is possible to
3905     // do so, so force non-NSA for the common 2-address case as a heuristic.
3906     //
3907     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3908     // allocation when possible.
3909     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3910 
3911     if (!UseNSA && NumVAddrs > 1)
3912       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3913   }
3914 
3915   int Flags = 0;
3916   if (IsA16)
3917     Flags |= 1;
3918   if (IsG16)
3919     Flags |= 2;
3920   MI.addOperand(MachineOperand::CreateImm(Flags));
3921 
3922   if (BaseOpcode->Store) { // No TFE for stores?
3923     // TODO: Handle dmask trim
3924     Register VData = MI.getOperand(1).getReg();
3925     LLT Ty = MRI->getType(VData);
3926     if (!Ty.isVector() || Ty.getElementType() != S16)
3927       return true;
3928 
3929     Register RepackedReg = handleD16VData(B, *MRI, VData);
3930     if (RepackedReg != VData) {
3931       MI.getOperand(1).setReg(RepackedReg);
3932     }
3933 
3934     return true;
3935   }
3936 
3937   Register DstReg = MI.getOperand(0).getReg();
3938   LLT Ty = MRI->getType(DstReg);
3939   const LLT EltTy = Ty.getScalarType();
3940   const bool IsD16 = Ty.getScalarType() == S16;
3941   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3942 
3943   // Confirm that the return type is large enough for the dmask specified
3944   if (NumElts < DMaskLanes)
3945     return false;
3946 
3947   if (NumElts > 4 || DMaskLanes > 4)
3948     return false;
3949 
3950   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3951   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3952 
3953   // The raw dword aligned data component of the load. The only legal cases
3954   // where this matters should be when using the packed D16 format, for
3955   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3956   LLT RoundedTy;
3957 
3958   // S32 vector to cover all data, plus the TFE result element.
3959   LLT TFETy;
3960 
3961   // Register type to use for each loaded component. Will be S32 or V2S16.
3962   LLT RegTy;
3963 
3964   if (IsD16 && ST.hasUnpackedD16VMem()) {
3965     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3966     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3967     RegTy = S32;
3968   } else {
3969     unsigned EltSize = EltTy.getSizeInBits();
3970     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3971     unsigned RoundedSize = 32 * RoundedElts;
3972     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3973     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3974     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3975   }
3976 
3977   // The return type does not need adjustment.
3978   // TODO: Should we change s16 case to s32 or <2 x s16>?
3979   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3980     return true;
3981 
3982   Register Dst1Reg;
3983 
3984   // Insert after the instruction.
3985   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3986 
3987   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3988   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3989   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3990   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3991 
3992   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3993 
3994   MI.getOperand(0).setReg(NewResultReg);
3995 
3996   // In the IR, TFE is supposed to be used with a 2-element struct return
3997   // type. The instruction really returns these two values in one contiguous
3998   // register, with one additional dword beyond the loaded data. Rewrite the
3999   // return type to use a single register result.
4000 
4001   if (IsTFE) {
4002     Dst1Reg = MI.getOperand(1).getReg();
4003     if (MRI->getType(Dst1Reg) != S32)
4004       return false;
4005 
4006     // TODO: Make sure the TFE operand bit is set.
4007     MI.RemoveOperand(1);
4008 
4009     // Handle the easy case that requires no repack instructions.
4010     if (Ty == S32) {
4011       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4012       return true;
4013     }
4014   }
4015 
4016   // Now figure out how to copy the new result register back into the old
4017   // result.
4018   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4019 
4020   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4021 
4022   if (ResultNumRegs == 1) {
4023     assert(!IsTFE);
4024     ResultRegs[0] = NewResultReg;
4025   } else {
4026     // We have to repack into a new vector of some kind.
4027     for (int I = 0; I != NumDataRegs; ++I)
4028       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4029     B.buildUnmerge(ResultRegs, NewResultReg);
4030 
4031     // Drop the final TFE element to get the data part. The TFE result is
4032     // directly written to the right place already.
4033     if (IsTFE)
4034       ResultRegs.resize(NumDataRegs);
4035   }
4036 
4037   // For an s16 scalar result, we form an s32 result with a truncate regardless
4038   // of packed vs. unpacked.
4039   if (IsD16 && !Ty.isVector()) {
4040     B.buildTrunc(DstReg, ResultRegs[0]);
4041     return true;
4042   }
4043 
4044   // Avoid a build/concat_vector of 1 entry.
4045   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4046     B.buildBitcast(DstReg, ResultRegs[0]);
4047     return true;
4048   }
4049 
4050   assert(Ty.isVector());
4051 
4052   if (IsD16) {
4053     // For packed D16 results with TFE enabled, all the data components are
4054     // S32. Cast back to the expected type.
4055     //
4056     // TODO: We don't really need to load s32 elements. We would only need one
4057     // cast for the TFE result if a multiple of v2s16 was used.
4058     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4059       for (Register &Reg : ResultRegs)
4060         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4061     } else if (ST.hasUnpackedD16VMem()) {
4062       for (Register &Reg : ResultRegs)
4063         Reg = B.buildTrunc(S16, Reg).getReg(0);
4064     }
4065   }
4066 
4067   auto padWithUndef = [&](LLT Ty, int NumElts) {
4068     if (NumElts == 0)
4069       return;
4070     Register Undef = B.buildUndef(Ty).getReg(0);
4071     for (int I = 0; I != NumElts; ++I)
4072       ResultRegs.push_back(Undef);
4073   };
4074 
4075   // Pad out any elements eliminated due to the dmask.
4076   LLT ResTy = MRI->getType(ResultRegs[0]);
4077   if (!ResTy.isVector()) {
4078     padWithUndef(ResTy, NumElts - ResultRegs.size());
4079     B.buildBuildVector(DstReg, ResultRegs);
4080     return true;
4081   }
4082 
4083   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4084   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4085 
4086   // Deal with the one annoying legal case.
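  // A v3s16 result can't be produced directly from v2s16 pieces, so pad the
  // pieces out to cover six elements, concatenate to v6s16, and unmerge into
  // two v3s16 halves, keeping the first.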
4087   const LLT V3S16 = LLT::vector(3, 16);
4088   if (Ty == V3S16) {
4089     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4090     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4091     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4092     return true;
4093   }
4094 
4095   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4096   B.buildConcatVectors(DstReg, ResultRegs);
4097   return true;
4098 }
4099 
4100 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4101   LegalizerHelper &Helper, MachineInstr &MI) const {
4102   MachineIRBuilder &B = Helper.MIRBuilder;
4103   GISelChangeObserver &Observer = Helper.Observer;
4104 
4105   Register Dst = MI.getOperand(0).getReg();
4106   LLT Ty = B.getMRI()->getType(Dst);
4107   unsigned Size = Ty.getSizeInBits();
4108   MachineFunction &MF = B.getMF();
4109 
4110   Observer.changingInstr(MI);
4111 
4112   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4113     Ty = getBitcastRegisterType(Ty);
4114     Helper.bitcastDst(MI, Ty, 0);
4115     Dst = MI.getOperand(0).getReg();
4116     B.setInsertPt(B.getMBB(), MI);
4117   }
4118 
4119   // FIXME: We don't really need this intermediate instruction. The intrinsic
4120   // should be fixed to have a memory operand. Since it's readnone, we're not
4121   // allowed to add one.
4122   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4123   MI.RemoveOperand(1); // Remove intrinsic ID
4124 
4125   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4126   // TODO: Should this use datalayout alignment?
4127   const unsigned MemSize = (Size + 7) / 8;
4128   const Align MemAlign(4);
4129   MachineMemOperand *MMO = MF.getMachineMemOperand(
4130       MachinePointerInfo(),
4131       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4132           MachineMemOperand::MOInvariant,
4133       MemSize, MemAlign);
4134   MI.addMemOperand(MF, MMO);
4135 
4136   // There are no 96-bit result scalar loads, but widening to 128-bit should
4137   // always be legal. We may need to restore this to a 96-bit result if it turns
4138   // out this needs to be converted to a vector load during RegBankSelect.
4139   if (!isPowerOf2_32(Size)) {
4140     if (Ty.isVector())
4141       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4142     else
4143       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4144   }
4145 
4146   Observer.changedInstr(MI);
4147   return true;
4148 }
4149 
4150 // TODO: Move to selection
4151 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4152                                                 MachineRegisterInfo &MRI,
4153                                                 MachineIRBuilder &B) const {
4154   // On a non-HSA path, or when the trap handler is disabled, insert s_endpgm.
4155   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4156       !ST.isTrapHandlerEnabled()) {
4157     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4158   } else {
4159     // Pass queue pointer to trap handler as input, and insert trap instruction
4160     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4161     MachineRegisterInfo &MRI = *B.getMRI();
4162 
4163     Register LiveIn =
4164       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4165     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4166       return false;
4167 
4168     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4169     B.buildCopy(SGPR01, LiveIn);
4170     B.buildInstr(AMDGPU::S_TRAP)
4171         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4172         .addReg(SGPR01, RegState::Implicit);
4173   }
4174 
4175   MI.eraseFromParent();
4176   return true;
4177 }
4178 
4179 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4180     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4181   // On a non-HSA path, or when the trap handler is disabled, report a warning
4182   // instead of trapping.
4183   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4184       !ST.isTrapHandlerEnabled()) {
4185     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4186                                      "debugtrap handler not supported",
4187                                      MI.getDebugLoc(), DS_Warning);
4188     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4189     Ctx.diagnose(NoTrap);
4190   } else {
4191     // Insert debug-trap instruction
4192     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4193   }
4194 
4195   MI.eraseFromParent();
4196   return true;
4197 }
4198 
4199 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4200                                             MachineInstr &MI) const {
4201   MachineIRBuilder &B = Helper.MIRBuilder;
4202   MachineRegisterInfo &MRI = *B.getMRI();
4203 
4204   // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4205   auto IntrID = MI.getIntrinsicID();
4206   switch (IntrID) {
4207   case Intrinsic::amdgcn_if:
4208   case Intrinsic::amdgcn_else: {
4209     MachineInstr *Br = nullptr;
4210     MachineBasicBlock *UncondBrTarget = nullptr;
4211     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4212       const SIRegisterInfo *TRI
4213         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4214 
4215       Register Def = MI.getOperand(1).getReg();
4216       Register Use = MI.getOperand(3).getReg();
4217 
4218       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4219       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4220       if (IntrID == Intrinsic::amdgcn_if) {
4221         B.buildInstr(AMDGPU::SI_IF)
4222           .addDef(Def)
4223           .addUse(Use)
4224           .addMBB(UncondBrTarget);
4225       } else {
4226         B.buildInstr(AMDGPU::SI_ELSE)
4227           .addDef(Def)
4228           .addUse(Use)
4229           .addMBB(UncondBrTarget)
4230           .addImm(0);
4231       }
4232 
4233       if (Br) {
4234         Br->getOperand(0).setMBB(CondBrTarget);
4235       } else {
4236         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4237         // since we're swapping branch targets it needs to be reinserted.
4238         // FIXME: IRTranslator should probably not do this
4239         B.buildBr(*CondBrTarget);
4240       }
4241 
4242       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4243       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4244       MI.eraseFromParent();
4245       BrCond->eraseFromParent();
4246       return true;
4247     }
4248 
4249     return false;
4250   }
4251   case Intrinsic::amdgcn_loop: {
4252     MachineInstr *Br = nullptr;
4253     MachineBasicBlock *UncondBrTarget = nullptr;
4254     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4255       const SIRegisterInfo *TRI
4256         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4257 
4258       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4259       Register Reg = MI.getOperand(2).getReg();
4260 
4261       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4262       B.buildInstr(AMDGPU::SI_LOOP)
4263         .addUse(Reg)
4264         .addMBB(UncondBrTarget);
4265 
4266       if (Br)
4267         Br->getOperand(0).setMBB(CondBrTarget);
4268       else
4269         B.buildBr(*CondBrTarget);
4270 
4271       MI.eraseFromParent();
4272       BrCond->eraseFromParent();
4273       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4274       return true;
4275     }
4276 
4277     return false;
4278   }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
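  // The wavefront size is a subtarget constant, so fold the intrinsic to an
  // immediate.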
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
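  // Buffer load/store/atomic intrinsics share common legalization helpers; the
  // trailing bool arguments select the format/typed buffer variants (see the
  // parameter lists of legalizeBufferStore and legalizeBufferLoad).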
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
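  // llvm.amdgcn.atomic.inc and llvm.amdgcn.atomic.dec share a helper; the bool
  // selects increment vs. decrement.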
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
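  // Image intrinsics are identified via getImageDimIntrinsicInfo and legalized
  // separately; any other intrinsic is already considered legal here.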
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}