1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of vector elements up to the next power of two.
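// e.g. <3 x s16> -> <4 x s16>.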
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the scalar size in bits up to the next power of two.
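// e.g. s48 -> s64.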
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
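// Match vectors with an odd number of sub-32-bit elements whose total size is
// not a multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.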
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
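// Match 16-bit element vectors with more than two elements, e.g. <4 x s16>.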
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
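// Add one element to the vector, e.g. <3 x s16> -> <4 x s16>.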
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
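// Reduce the element count so each resulting piece is at most 64 bits wide,
// keeping the element type, e.g. <4 x s32> -> <2 x s32>.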
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements so the total size becomes the next
// multiple of 32 bits.
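// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits).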
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
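// Compute a same-sized type to bitcast to: a scalar for sizes up to 32 bits,
// otherwise a vector of 32-bit elements, e.g. <8 x s8> -> <2 x s32>.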
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
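// True for sizes that occupy a whole number of 32-bit registers, up to the
// maximum register size.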
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32-bit or 64-bit elements up to the maximum register
// size, and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
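// Widest memory access (in bits) to keep as a single load/store for the given
// address space.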
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
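// Return true if a load/store with the queried register type, memory size,
// alignment, and address space can be handled as a single memory operation.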
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, Align);
#endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
  : ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .legalIf(isPointer(0))
419     .clampScalar(0, S32, S256)
420     .widenScalarToNextPow2(0, 32)
421     .clampMaxNumElements(0, S32, 16)
422     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
423     .scalarize(0);
424 
425   if (ST.hasVOP3PInsts()) {
426     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
427       .legalFor({S32, S16, V2S16})
428       .clampScalar(0, S16, S32)
429       .clampMaxNumElements(0, S16, 2)
430       .scalarize(0)
431       .widenScalarToNextPow2(0, 32);
432 
433     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
434       .lowerFor({S32, S16, V2S16}) // FIXME: legal and merge with add/sub/mul
435       .minScalar(0, S16)
436       .clampMaxNumElements(0, S16, 2)
437       .scalarize(0)
438       .widenScalarToNextPow2(0, 32)
439       .lower();
440   } else if (ST.has16BitInsts()) {
441     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
442       .legalFor({S32, S16})
443       .clampScalar(0, S16, S32)
444       .scalarize(0)
445       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
446 
447     // Technically the saturating operations require clamp bit support, but this
448     // was introduced at the same time as 16-bit operations.
449     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
450       .lowerFor({S32, S16}) // FIXME: legal with clamp modifier
451       .minScalar(0, S16)
452       .scalarize(0)
453       .widenScalarToNextPow2(0, 16)
454       .lower();
455 
456     // We're just lowering this, but it helps get a better result to try to
457     // coerce to the desired type first.
458     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
459       .minScalar(0, S16)
460       .scalarize(0)
461       .lower();
462   } else {
463     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
464       .legalFor({S32})
465       .clampScalar(0, S32, S32)
466       .scalarize(0);
467 
468     if (ST.hasIntClamp()) {
469       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
470         .lowerFor({S32}) // FIXME: legal with clamp modifier.
471         .scalarize(0)
472         .minScalarOrElt(0, S32)
473         .lower();
474     } else {
475       // Clamp bit support was added in VI, along with 16-bit operations.
476       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
477         .minScalar(0, S32)
478         .scalarize(0)
479         .lower();
480     }
481 
482     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
483       .minScalar(0, S32)
484       .scalarize(0)
485       .lower();
486   }
487 
488   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
489     .customFor({S32, S64})
490     .clampScalar(0, S32, S64)
491     .widenScalarToNextPow2(0, 32)
492     .scalarize(0);
493 
494   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
495     .legalFor({S32})
496     .clampScalar(0, S32, S32)
497     .scalarize(0);
498 
499   // Report legal for any types we can handle anywhere. For the cases only legal
500   // on the SALU, RegBankSelect will be able to re-legalize.
501   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
502     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
503     .clampScalar(0, S32, S64)
504     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
505     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
506     .widenScalarToNextPow2(0)
507     .scalarize(0);
508 
509   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
510                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
511     .legalFor({{S32, S1}, {S32, S32}})
512     .minScalar(0, S32)
513     // TODO: .scalarize(0)
514     .lower();
515 
516   getActionDefinitionsBuilder(G_BITCAST)
517     // Don't worry about the size constraint.
518     .legalIf(all(isRegisterType(0), isRegisterType(1)))
519     .lower();
520 
521 
522   getActionDefinitionsBuilder(G_CONSTANT)
523     .legalFor({S1, S32, S64, S16, GlobalPtr,
524                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
525     .legalIf(isPointer(0))
526     .clampScalar(0, S32, S64)
527     .widenScalarToNextPow2(0);
528 
529   getActionDefinitionsBuilder(G_FCONSTANT)
530     .legalFor({S32, S64, S16})
531     .clampScalar(0, S16, S64);
532 
533   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
534       .legalIf(isRegisterType(0))
535       // s1 and s16 are special cases because they have legal operations on
536       // them, but don't really occupy registers in the normal way.
537       .legalFor({S1, S16})
538       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
539       .clampScalarOrElt(0, S32, MaxScalar)
540       .widenScalarToNextPow2(0, 32)
541       .clampMaxNumElements(0, S32, 16);
542 
543   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
544 
545   // If the amount is divergent, we have to do a wave reduction to get the
546   // maximum value, so this is expanded during RegBankSelect.
547   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
548     .legalFor({{PrivatePtr, S32}});
549 
550   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
551     .unsupportedFor({PrivatePtr})
552     .custom();
553   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
554 
555   auto &FPOpActions = getActionDefinitionsBuilder(
556     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
557     .legalFor({S32, S64});
558   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
559     .customFor({S32, S64});
560   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
561     .customFor({S32, S64});
562 
563   if (ST.has16BitInsts()) {
564     if (ST.hasVOP3PInsts())
565       FPOpActions.legalFor({S16, V2S16});
566     else
567       FPOpActions.legalFor({S16});
568 
569     TrigActions.customFor({S16});
570     FDIVActions.customFor({S16});
571   }
572 
573   auto &MinNumMaxNum = getActionDefinitionsBuilder({
574       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
575 
576   if (ST.hasVOP3PInsts()) {
577     MinNumMaxNum.customFor(FPTypesPK16)
578       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
579       .clampMaxNumElements(0, S16, 2)
580       .clampScalar(0, S16, S64)
581       .scalarize(0);
582   } else if (ST.has16BitInsts()) {
583     MinNumMaxNum.customFor(FPTypes16)
584       .clampScalar(0, S16, S64)
585       .scalarize(0);
586   } else {
587     MinNumMaxNum.customFor(FPTypesBase)
588       .clampScalar(0, S32, S64)
589       .scalarize(0);
590   }
591 
592   if (ST.hasVOP3PInsts())
593     FPOpActions.clampMaxNumElements(0, S16, 2);
594 
595   FPOpActions
596     .scalarize(0)
597     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
598 
599   TrigActions
600     .scalarize(0)
601     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
602 
603   FDIVActions
604     .scalarize(0)
605     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
606 
607   getActionDefinitionsBuilder({G_FNEG, G_FABS})
608     .legalFor(FPTypesPK16)
609     .clampMaxNumElements(0, S16, 2)
610     .scalarize(0)
611     .clampScalar(0, S16, S64);
612 
613   if (ST.has16BitInsts()) {
614     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
615       .legalFor({S32, S64, S16})
616       .scalarize(0)
617       .clampScalar(0, S16, S64);
618   } else {
619     getActionDefinitionsBuilder(G_FSQRT)
620       .legalFor({S32, S64})
621       .scalarize(0)
622       .clampScalar(0, S32, S64);
623 
624     if (ST.hasFractBug()) {
625       getActionDefinitionsBuilder(G_FFLOOR)
626         .customFor({S64})
627         .legalFor({S32, S64})
628         .scalarize(0)
629         .clampScalar(0, S32, S64);
630     } else {
631       getActionDefinitionsBuilder(G_FFLOOR)
632         .legalFor({S32, S64})
633         .scalarize(0)
634         .clampScalar(0, S32, S64);
635     }
636   }
637 
638   getActionDefinitionsBuilder(G_FPTRUNC)
639     .legalFor({{S32, S64}, {S16, S32}})
640     .scalarize(0)
641     .lower();
642 
643   getActionDefinitionsBuilder(G_FPEXT)
644     .legalFor({{S64, S32}, {S32, S16}})
645     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
646     .scalarize(0);
647 
648   getActionDefinitionsBuilder(G_FSUB)
649       // Use actual fsub instruction
650       .legalFor({S32})
651       // Must use fadd + fneg
652       .lowerFor({S64, S16, V2S16})
653       .scalarize(0)
654       .clampScalar(0, S32, S64);
655 
656   // Whether this is legal depends on the floating point mode for the function.
657   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
658   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
659     FMad.customFor({S32, S16});
660   else if (ST.hasMadMacF32Insts())
661     FMad.customFor({S32});
662   else if (ST.hasMadF16())
663     FMad.customFor({S16});
664   FMad.scalarize(0)
665       .lower();
666 
667   // TODO: Do we need to clamp maximum bitwidth?
668   getActionDefinitionsBuilder(G_TRUNC)
669     .legalIf(isScalar(0))
670     .legalFor({{V2S16, V2S32}})
671     .clampMaxNumElements(0, S16, 2)
672     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
673     // situations (like an invalid implicit use), we don't want to infinite loop
674     // in the legalizer.
675     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
676     .alwaysLegal();
677 
678   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
679     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
680                {S32, S1}, {S64, S1}, {S16, S1}})
681     .scalarize(0)
682     .clampScalar(0, S32, S64)
683     .widenScalarToNextPow2(1, 32);
684 
685   // TODO: Split s1->s64 during regbankselect for VALU.
686   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
687     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
688     .lowerFor({{S32, S64}})
689     .lowerIf(typeIs(1, S1))
690     .customFor({{S64, S64}});
691   if (ST.has16BitInsts())
692     IToFP.legalFor({{S16, S16}});
693   IToFP.clampScalar(1, S32, S64)
694        .minScalar(0, S32)
695        .scalarize(0)
696        .widenScalarToNextPow2(1);
697 
698   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
699     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
700     .customFor({{S64, S64}})
701     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
702   if (ST.has16BitInsts())
703     FPToI.legalFor({{S16, S16}});
704   else
705     FPToI.minScalar(1, S32);
706 
707   FPToI.minScalar(0, S32)
708        .scalarize(0)
709        .lower();
710 
711   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
712     .scalarize(0)
713     .lower();
714 
715   if (ST.has16BitInsts()) {
716     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
717       .legalFor({S16, S32, S64})
718       .clampScalar(0, S16, S64)
719       .scalarize(0);
720   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
721     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
722       .legalFor({S32, S64})
723       .clampScalar(0, S32, S64)
724       .scalarize(0);
725   } else {
726     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
727       .legalFor({S32})
728       .customFor({S64})
729       .clampScalar(0, S32, S64)
730       .scalarize(0);
731   }
732 
733   getActionDefinitionsBuilder(G_PTR_ADD)
734     .legalIf(all(isPointer(0), sameSize(0, 1)))
735     .scalarize(0)
736     .scalarSameSizeAs(1, 0);
737 
738   getActionDefinitionsBuilder(G_PTRMASK)
739     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
740     .scalarSameSizeAs(1, 0)
741     .scalarize(0);
742 
743   auto &CmpBuilder =
744     getActionDefinitionsBuilder(G_ICMP)
745     // The compare output type differs based on the register bank of the output,
746     // so make both s1 and s32 legal.
747     //
748     // Scalar compares producing output in scc will be promoted to s32, as that
749     // is the allocatable register type that will be needed for the copy from
750     // scc. This will be promoted during RegBankSelect, and we assume something
751     // before that won't try to use s32 result types.
752     //
753     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
754     // bank.
755     .legalForCartesianProduct(
756       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
757     .legalForCartesianProduct(
758       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
759   if (ST.has16BitInsts()) {
760     CmpBuilder.legalFor({{S1, S16}});
761   }
762 
763   CmpBuilder
764     .widenScalarToNextPow2(1)
765     .clampScalar(1, S32, S64)
766     .scalarize(0)
767     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
768 
769   getActionDefinitionsBuilder(G_FCMP)
770     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
771     .widenScalarToNextPow2(1)
772     .clampScalar(1, S32, S64)
773     .scalarize(0);
774 
775   // FIXME: fpow has a selection pattern that should move to custom lowering.
776   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
777   if (ST.has16BitInsts())
778     Exp2Ops.legalFor({S32, S16});
779   else
780     Exp2Ops.legalFor({S32});
781   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
782   Exp2Ops.scalarize(0);
783 
784   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
785   if (ST.has16BitInsts())
786     ExpOps.customFor({{S32}, {S16}});
787   else
788     ExpOps.customFor({S32});
789   ExpOps.clampScalar(0, MinScalarFPTy, S32)
790         .scalarize(0);
791 
792   getActionDefinitionsBuilder(G_FPOWI)
793     .clampScalar(0, MinScalarFPTy, S32)
794     .lower();
795 
796   // The 64-bit versions produce 32-bit results, but only on the SALU.
797   getActionDefinitionsBuilder(G_CTPOP)
798     .legalFor({{S32, S32}, {S32, S64}})
799     .clampScalar(0, S32, S32)
800     .clampScalar(1, S32, S64)
801     .scalarize(0)
802     .widenScalarToNextPow2(0, 32)
803     .widenScalarToNextPow2(1, 32);
804 
805   // The hardware instructions return a different result on 0 than the generic
806   // instructions expect. The hardware produces -1, but these produce the
807   // bitwidth.
808   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
809     .scalarize(0)
810     .clampScalar(0, S32, S32)
811     .clampScalar(1, S32, S64)
812     .widenScalarToNextPow2(0, 32)
813     .widenScalarToNextPow2(1, 32)
814     .lower();
815 
816   // The 64-bit versions produce 32-bit results, but only on the SALU.
817   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
818     .legalFor({{S32, S32}, {S32, S64}})
819     .clampScalar(0, S32, S32)
820     .clampScalar(1, S32, S64)
821     .scalarize(0)
822     .widenScalarToNextPow2(0, 32)
823     .widenScalarToNextPow2(1, 32);
824 
825   getActionDefinitionsBuilder(G_BITREVERSE)
826     .legalFor({S32})
827     .clampScalar(0, S32, S32)
828     .scalarize(0);
829 
830   if (ST.has16BitInsts()) {
831     getActionDefinitionsBuilder(G_BSWAP)
832       .legalFor({S16, S32, V2S16})
833       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
836       .widenScalarToNextPow2(0)
837       .clampScalar(0, S16, S32)
838       .scalarize(0);
839 
840     if (ST.hasVOP3PInsts()) {
841       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
842         .legalFor({S32, S16, V2S16})
843         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
844         .clampMaxNumElements(0, S16, 2)
845         .minScalar(0, S16)
846         .widenScalarToNextPow2(0)
847         .scalarize(0)
848         .lower();
849     } else {
850       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
851         .legalFor({S32, S16})
852         .widenScalarToNextPow2(0)
853         .minScalar(0, S16)
854         .scalarize(0)
855         .lower();
856     }
857   } else {
858     // TODO: Should have same legality without v_perm_b32
859     getActionDefinitionsBuilder(G_BSWAP)
860       .legalFor({S32})
861       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
864       .widenScalarToNextPow2(0)
865       .maxScalar(0, S32)
866       .scalarize(0)
867       .lower();
868 
869     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
870       .legalFor({S32})
871       .minScalar(0, S32)
872       .widenScalarToNextPow2(0)
873       .scalarize(0)
874       .lower();
875   }
876 
877   getActionDefinitionsBuilder(G_INTTOPTR)
878     // List the common cases
879     .legalForCartesianProduct(AddrSpaces64, {S64})
880     .legalForCartesianProduct(AddrSpaces32, {S32})
881     .scalarize(0)
882     // Accept any address space as long as the size matches
883     .legalIf(sameSize(0, 1))
884     .widenScalarIf(smallerThan(1, 0),
885       [](const LegalityQuery &Query) {
886         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
887       })
888     .narrowScalarIf(largerThan(1, 0),
889       [](const LegalityQuery &Query) {
890         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
891       });
892 
893   getActionDefinitionsBuilder(G_PTRTOINT)
894     // List the common cases
895     .legalForCartesianProduct(AddrSpaces64, {S64})
896     .legalForCartesianProduct(AddrSpaces32, {S32})
897     .scalarize(0)
898     // Accept any address space as long as the size matches
899     .legalIf(sameSize(0, 1))
900     .widenScalarIf(smallerThan(0, 1),
901       [](const LegalityQuery &Query) {
902         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
903       })
904     .narrowScalarIf(
905       largerThan(0, 1),
906       [](const LegalityQuery &Query) {
907         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
908       });
909 
910   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
911     .scalarize(0)
912     .custom();
913 
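  // Return true if this memory access has to be split into multiple
  // operations: vector extloads, accesses wider than the address space
  // allows, register counts that are not handled (non-power-of-2, or 3 dwords
  // without dwordx3 support), or misaligned accesses the target cannot do.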
914   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
915                                     bool IsLoad) -> bool {
916     const LLT DstTy = Query.Types[0];
917 
918     // Split vector extloads.
919     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
920     unsigned Align = Query.MMODescrs[0].AlignInBits;
921 
922     if (MemSize < DstTy.getSizeInBits())
923       MemSize = std::max(MemSize, Align);
924 
925     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
926       return true;
927 
928     const LLT PtrTy = Query.Types[1];
929     unsigned AS = PtrTy.getAddressSpace();
930     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
931       return true;
932 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
934     // TODO: May be able to widen depending on alignment etc.
935     unsigned NumRegs = (MemSize + 31) / 32;
936     if (NumRegs == 3) {
937       if (!ST.hasDwordx3LoadStores())
938         return true;
939     } else {
940       // If the alignment allows, these should have been widened.
941       if (!isPowerOf2_32(NumRegs))
942         return true;
943     }
944 
945     if (Align < MemSize) {
946       const SITargetLowering *TLI = ST.getTargetLowering();
947       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
948     }
949 
950     return false;
951   };
952 
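  // Return true if a load with a non-power-of-2 result size should be widened
  // to the next power of two instead of split, given sufficient alignment and
  // the address space's maximum access size.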
953   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
954                                          unsigned Opc) -> bool {
955     unsigned Size = Query.Types[0].getSizeInBits();
956     if (isPowerOf2_32(Size))
957       return false;
958 
959     if (Size == 96 && ST.hasDwordx3LoadStores())
960       return false;
961 
962     unsigned AddrSpace = Query.Types[1].getAddressSpace();
963     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
964       return false;
965 
966     unsigned Align = Query.MMODescrs[0].AlignInBits;
967     unsigned RoundedSize = NextPowerOf2(Size);
968     return (Align >= RoundedSize);
969   };
970 
971   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
972   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
973   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
974 
975   // TODO: Refine based on subtargets which support unaligned access or 128-bit
976   // LDS
977   // TODO: Unsupported flat for SI.
978 
979   for (unsigned Op : {G_LOAD, G_STORE}) {
980     const bool IsStore = Op == G_STORE;
981 
982     auto &Actions = getActionDefinitionsBuilder(Op);
983     // Explicitly list some common cases.
984     // TODO: Does this help compile time at all?
985     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
986                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
987                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
988                                       {S64, GlobalPtr, 64, GlobalAlign32},
989                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
990                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
991                                       {S32, GlobalPtr, 8, GlobalAlign8},
992                                       {S32, GlobalPtr, 16, GlobalAlign16},
993 
994                                       {S32, LocalPtr, 32, 32},
995                                       {S64, LocalPtr, 64, 32},
996                                       {V2S32, LocalPtr, 64, 32},
997                                       {S32, LocalPtr, 8, 8},
998                                       {S32, LocalPtr, 16, 16},
999                                       {V2S16, LocalPtr, 32, 32},
1000 
1001                                       {S32, PrivatePtr, 32, 32},
1002                                       {S32, PrivatePtr, 8, 8},
1003                                       {S32, PrivatePtr, 16, 16},
1004                                       {V2S16, PrivatePtr, 32, 32},
1005 
1006                                       {S32, ConstantPtr, 32, GlobalAlign32},
1007                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1008                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1009                                       {S64, ConstantPtr, 64, GlobalAlign32},
1010                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1011     Actions.legalIf(
1012       [=](const LegalityQuery &Query) -> bool {
1013         return isLoadStoreLegal(ST, Query, Op);
1014       });
1015 
1016     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1017     // 64-bits.
1018     //
1019     // TODO: Should generalize bitcast action into coerce, which will also cover
1020     // inserting addrspacecasts.
1021     Actions.customIf(typeIs(1, Constant32Ptr));
1022 
1023     // Turn any illegal element vectors into something easier to deal
1024     // with. These will ultimately produce 32-bit scalar shifts to extract the
1025     // parts anyway.
1026     //
1027     // For odd 16-bit element vectors, prefer to split those into pieces with
1028     // 16-bit vector parts.
1029     Actions.bitcastIf(
1030       [=](const LegalityQuery &Query) -> bool {
1031         const LLT Ty = Query.Types[0];
1032         const unsigned Size = Ty.getSizeInBits();
1033 
1034         if (Size != Query.MMODescrs[0].SizeInBits)
1035           return Size <= 32 && Ty.isVector();
1036 
1037         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
1038           return true;
1039         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
1040                !isRegisterVectorElementType(Ty.getElementType());
1041       }, bitcastToRegisterType(0));
1042 
    Actions
1045         // Widen suitably aligned loads by loading extra elements.
1046         .moreElementsIf([=](const LegalityQuery &Query) {
1047             const LLT Ty = Query.Types[0];
1048             return Op == G_LOAD && Ty.isVector() &&
1049                    shouldWidenLoadResult(Query, Op);
1050           }, moreElementsToNextPow2(0))
1051         .widenScalarIf([=](const LegalityQuery &Query) {
1052             const LLT Ty = Query.Types[0];
1053             return Op == G_LOAD && !Ty.isVector() &&
1054                    shouldWidenLoadResult(Query, Op);
1055           }, widenScalarOrEltToNextPow2(0))
1056         .narrowScalarIf(
1057             [=](const LegalityQuery &Query) -> bool {
1058               return !Query.Types[0].isVector() &&
1059                      needToSplitMemOp(Query, Op == G_LOAD);
1060             },
1061             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1062               const LLT DstTy = Query.Types[0];
1063               const LLT PtrTy = Query.Types[1];
1064 
1065               const unsigned DstSize = DstTy.getSizeInBits();
1066               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1067 
1068               // Split extloads.
1069               if (DstSize > MemSize)
1070                 return std::make_pair(0, LLT::scalar(MemSize));
1071 
1072               if (!isPowerOf2_32(DstSize)) {
1073                 // We're probably decomposing an odd sized store. Try to split
1074                 // to the widest type. TODO: Account for alignment. As-is it
1075                 // should be OK, since the new parts will be further legalized.
1076                 unsigned FloorSize = PowerOf2Floor(DstSize);
1077                 return std::make_pair(0, LLT::scalar(FloorSize));
1078               }
1079 
1080               if (DstSize > 32 && (DstSize % 32 != 0)) {
1081                 // FIXME: Need a way to specify non-extload of larger size if
1082                 // suitably aligned.
1083                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1084               }
1085 
1086               unsigned MaxSize = maxSizeForAddrSpace(ST,
1087                                                      PtrTy.getAddressSpace(),
1088                                                      Op == G_LOAD);
1089               if (MemSize > MaxSize)
1090                 return std::make_pair(0, LLT::scalar(MaxSize));
1091 
1092               unsigned Align = Query.MMODescrs[0].AlignInBits;
1093               return std::make_pair(0, LLT::scalar(Align));
1094             })
1095         .fewerElementsIf(
1096             [=](const LegalityQuery &Query) -> bool {
1097               return Query.Types[0].isVector() &&
1098                      needToSplitMemOp(Query, Op == G_LOAD);
1099             },
1100             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1101               const LLT DstTy = Query.Types[0];
1102               const LLT PtrTy = Query.Types[1];
1103 
1104               LLT EltTy = DstTy.getElementType();
1105               unsigned MaxSize = maxSizeForAddrSpace(ST,
1106                                                      PtrTy.getAddressSpace(),
1107                                                      Op == G_LOAD);
1108 
1109               // FIXME: Handle widened to power of 2 results better. This ends
1110               // up scalarizing.
1111               // FIXME: 3 element stores scalarized on SI
1112 
1113               // Split if it's too large for the address space.
1114               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1115                 unsigned NumElts = DstTy.getNumElements();
1116                 unsigned EltSize = EltTy.getSizeInBits();
1117 
1118                 if (MaxSize % EltSize == 0) {
1119                   return std::make_pair(
1120                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1121                 }
1122 
1123                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1124 
1125                 // FIXME: Refine when odd breakdowns handled
1126                 // The scalars will need to be re-legalized.
1127                 if (NumPieces == 1 || NumPieces >= NumElts ||
1128                     NumElts % NumPieces != 0)
1129                   return std::make_pair(0, EltTy);
1130 
1131                 return std::make_pair(0,
1132                                       LLT::vector(NumElts / NumPieces, EltTy));
1133               }
1134 
1135               // FIXME: We could probably handle weird extending loads better.
1136               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1137               if (DstTy.getSizeInBits() > MemSize)
1138                 return std::make_pair(0, EltTy);
1139 
1140               unsigned EltSize = EltTy.getSizeInBits();
1141               unsigned DstSize = DstTy.getSizeInBits();
1142               if (!isPowerOf2_32(DstSize)) {
1143                 // We're probably decomposing an odd sized store. Try to split
1144                 // to the widest type. TODO: Account for alignment. As-is it
1145                 // should be OK, since the new parts will be further legalized.
1146                 unsigned FloorSize = PowerOf2Floor(DstSize);
1147                 return std::make_pair(
1148                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1149               }
1150 
1151               // Need to split because of alignment.
1152               unsigned Align = Query.MMODescrs[0].AlignInBits;
1153               if (EltSize > Align &&
1154                   (EltSize / Align < DstTy.getNumElements())) {
1155                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1156               }
1157 
1158               // May need relegalization for the scalars.
1159               return std::make_pair(0, EltTy);
1160             })
1161         .minScalar(0, S32);
1162 
1163     if (IsStore)
1164       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1165 
1166     // TODO: Need a bitcast lower option?
1167     Actions
1168         .widenScalarToNextPow2(0)
1169         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1170   }
1171 
1172   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1173                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1174                                                   {S32, GlobalPtr, 16, 2 * 8},
1175                                                   {S32, LocalPtr, 8, 8},
1176                                                   {S32, LocalPtr, 16, 16},
1177                                                   {S32, PrivatePtr, 8, 8},
1178                                                   {S32, PrivatePtr, 16, 16},
1179                                                   {S32, ConstantPtr, 8, 8},
1180                                                   {S32, ConstantPtr, 16, 2 * 8}});
1181   if (ST.hasFlatAddressSpace()) {
1182     ExtLoads.legalForTypesWithMemDesc(
1183         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1184   }
1185 
1186   ExtLoads.clampScalar(0, S32, S32)
1187           .widenScalarToNextPow2(0)
1188           .unsupportedIfMemSizeNotPow2()
1189           .lower();
1190 
1191   auto &Atomics = getActionDefinitionsBuilder(
1192     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1193      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1194      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1195      G_ATOMICRMW_UMIN})
1196     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1197                {S64, GlobalPtr}, {S64, LocalPtr},
1198                {S32, RegionPtr}, {S64, RegionPtr}});
1199   if (ST.hasFlatAddressSpace()) {
1200     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1201   }
1202 
1203   if (ST.hasLDSFPAtomics()) {
1204     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1205       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1206   }
1207 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1210   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1211     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1212                 {S32, FlatPtr}, {S64, FlatPtr}})
1213     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1214                {S32, RegionPtr}, {S64, RegionPtr}});
1215   // TODO: Pointer types, any 32-bit or 64-bit vector
1216 
1217   // Condition should be s32 for scalar, s1 for vector.
1218   getActionDefinitionsBuilder(G_SELECT)
1219     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1220           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1221           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1222     .clampScalar(0, S16, S64)
1223     .scalarize(1)
1224     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1225     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1226     .clampMaxNumElements(0, S32, 2)
1227     .clampMaxNumElements(0, LocalPtr, 2)
1228     .clampMaxNumElements(0, PrivatePtr, 2)
1229     .scalarize(0)
1230     .widenScalarToNextPow2(0)
1231     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1232 
1233   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1234   // be more flexible with the shift amount type.
1235   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1236     .legalFor({{S32, S32}, {S64, S32}});
1237   if (ST.has16BitInsts()) {
1238     if (ST.hasVOP3PInsts()) {
1239       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1240             .clampMaxNumElements(0, S16, 2);
1241     } else
1242       Shifts.legalFor({{S16, S16}});
1243 
1244     // TODO: Support 16-bit shift amounts for all types
1245     Shifts.widenScalarIf(
1246       [=](const LegalityQuery &Query) {
1247         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1248         // 32-bit amount.
1249         const LLT ValTy = Query.Types[0];
1250         const LLT AmountTy = Query.Types[1];
1251         return ValTy.getSizeInBits() <= 16 &&
1252                AmountTy.getSizeInBits() < 16;
1253       }, changeTo(1, S16));
1254     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1255     Shifts.clampScalar(1, S32, S32);
1256     Shifts.clampScalar(0, S16, S64);
1257     Shifts.widenScalarToNextPow2(0, 16);
1258   } else {
1259     // Make sure we legalize the shift amount type first, as the general
1260     // expansion for the shifted type will produce much worse code if it hasn't
1261     // been truncated already.
1262     Shifts.clampScalar(1, S32, S32);
1263     Shifts.clampScalar(0, S32, S64);
1264     Shifts.widenScalarToNextPow2(0, 32);
1265   }
1266   Shifts.scalarize(0);
1267 
1268   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1269     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1270     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1271     unsigned IdxTypeIdx = 2;
1272 
1273     getActionDefinitionsBuilder(Op)
1274       .customIf([=](const LegalityQuery &Query) {
1275           const LLT EltTy = Query.Types[EltTypeIdx];
1276           const LLT VecTy = Query.Types[VecTypeIdx];
1277           const LLT IdxTy = Query.Types[IdxTypeIdx];
1278           return (EltTy.getSizeInBits() == 16 ||
1279                   EltTy.getSizeInBits() % 32 == 0) &&
1280                  VecTy.getSizeInBits() % 32 == 0 &&
1281                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1282                  IdxTy.getSizeInBits() == 32;
1283         })
1284       .clampScalar(EltTypeIdx, S32, S64)
1285       .clampScalar(VecTypeIdx, S32, S64)
1286       .clampScalar(IdxTypeIdx, S32, S32);
1287   }
1288 
1289   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1290     .unsupportedIf([=](const LegalityQuery &Query) {
1291         const LLT &EltTy = Query.Types[1].getElementType();
1292         return Query.Types[0] != EltTy;
1293       });
1294 
1295   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1296     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1297     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1298 
1299     // FIXME: Doesn't handle extract of illegal sizes.
1300     getActionDefinitionsBuilder(Op)
1301       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1302       // FIXME: Multiples of 16 should not be legal.
1303       .legalIf([=](const LegalityQuery &Query) {
1304           const LLT BigTy = Query.Types[BigTyIdx];
1305           const LLT LitTy = Query.Types[LitTyIdx];
1306           return (BigTy.getSizeInBits() % 32 == 0) &&
1307                  (LitTy.getSizeInBits() % 16 == 0);
1308         })
1309       .widenScalarIf(
1310         [=](const LegalityQuery &Query) {
1311           const LLT BigTy = Query.Types[BigTyIdx];
1312           return (BigTy.getScalarSizeInBits() < 16);
1313         },
1314         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1315       .widenScalarIf(
1316         [=](const LegalityQuery &Query) {
1317           const LLT LitTy = Query.Types[LitTyIdx];
1318           return (LitTy.getScalarSizeInBits() < 16);
1319         },
1320         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1321       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1322       .widenScalarToNextPow2(BigTyIdx, 32);
1323 
1324   }
1325 
1326   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1327     .legalForCartesianProduct(AllS32Vectors, {S32})
1328     .legalForCartesianProduct(AllS64Vectors, {S64})
1329     .clampNumElements(0, V16S32, V32S32)
1330     .clampNumElements(0, V2S64, V16S64)
1331     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1332 
1333   if (ST.hasScalarPackInsts()) {
1334     BuildVector
1335       // FIXME: Should probably widen s1 vectors straight to s32
1336       .minScalarOrElt(0, S16)
1337       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1338       .minScalar(1, S32);
1339 
1340     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1341       .legalFor({V2S16, S32})
1342       .lower();
1343     BuildVector.minScalarOrElt(0, S32);
1344   } else {
1345     BuildVector.customFor({V2S16, S16});
1346     BuildVector.minScalarOrElt(0, S32);
1347 
1348     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1349       .customFor({V2S16, S32})
1350       .lower();
1351   }
1352 
1353   BuildVector.legalIf(isRegisterType(0));
1354 
1355   // FIXME: Clamp maximum size
1356   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1357     .legalIf(isRegisterType(0));
1358 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalization.
1361   if (ST.hasVOP3PInsts()) {
1362     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1363       .customFor({V2S16, V2S16})
1364       .lower();
1365   } else
1366     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1367 
1368   // Merge/Unmerge
1369   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1370     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1371     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1372 
1373     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1374       const LLT Ty = Query.Types[TypeIdx];
1375       if (Ty.isVector()) {
1376         const LLT &EltTy = Ty.getElementType();
1377         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1378           return true;
1379         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1380           return true;
1381       }
1382       return false;
1383     };
1384 
1385     auto &Builder = getActionDefinitionsBuilder(Op)
1386       .lowerFor({{S16, V2S16}})
1387       .lowerIf([=](const LegalityQuery &Query) {
1388           const LLT BigTy = Query.Types[BigTyIdx];
1389           return BigTy.getSizeInBits() == 32;
1390         })
1391       // Try to widen to s16 first for small types.
1392       // TODO: Only do this on targets with legal s16 shifts
1393       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1394       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1395       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1396       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1397                            elementTypeIs(1, S16)),
1398                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1402       .clampScalar(LitTyIdx, S32, S512)
1403       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1404       // Break up vectors with weird elements into scalars
1405       .fewerElementsIf(
1406         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1407         scalarize(0))
1408       .fewerElementsIf(
1409         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1410         scalarize(1))
1411       .clampScalar(BigTyIdx, S32, MaxScalar);
1412 
1413     if (Op == G_MERGE_VALUES) {
1414       Builder.widenScalarIf(
1415         // TODO: Use 16-bit shifts if legal for 8-bit values?
1416         [=](const LegalityQuery &Query) {
1417           const LLT Ty = Query.Types[LitTyIdx];
1418           return Ty.getSizeInBits() < 32;
1419         },
1420         changeTo(LitTyIdx, S32));
1421     }
1422 
1423     Builder.widenScalarIf(
1424       [=](const LegalityQuery &Query) {
1425         const LLT Ty = Query.Types[BigTyIdx];
1426         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1427           Ty.getSizeInBits() % 16 != 0;
1428       },
1429       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
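        // e.g. s90 -> s128, s300 -> s320.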
1432         const LLT &Ty = Query.Types[BigTyIdx];
1433         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1434         if (NewSizeInBits >= 256) {
1435           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1436           if (RoundedTo < NewSizeInBits)
1437             NewSizeInBits = RoundedTo;
1438         }
1439         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1440       })
1441       .legalIf([=](const LegalityQuery &Query) {
1442           const LLT &BigTy = Query.Types[BigTyIdx];
1443           const LLT &LitTy = Query.Types[LitTyIdx];
1444 
1445           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1446             return false;
1447           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1448             return false;
1449 
1450           return BigTy.getSizeInBits() % 16 == 0 &&
1451                  LitTy.getSizeInBits() % 16 == 0 &&
1452                  BigTy.getSizeInBits() <= MaxRegisterSize;
1453         })
1454       // Any vectors left are the wrong size. Scalarize them.
1455       .scalarize(0)
1456       .scalarize(1);
1457   }
1458 
1459   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1460   // RegBankSelect.
1461   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1462     .legalFor({{S32}, {S64}});
1463 
1464   if (ST.hasVOP3PInsts()) {
1465     SextInReg.lowerFor({{V2S16}})
1466       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1467       // get more vector shift opportunities, since we'll get those when
1468       // expanded.
1469       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1470   } else if (ST.has16BitInsts()) {
1471     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1472   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1475     SextInReg.lowerFor({{S32}, {S64}});
1476   }
1477 
1478   SextInReg
1479     .scalarize(0)
1480     .clampScalar(0, S32, S64)
1481     .lower();
1482 
1483   getActionDefinitionsBuilder(G_FSHR)
1484     .legalFor({{S32, S32}})
1485     .scalarize(0)
1486     .lower();
1487 
1488   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1489     .legalFor({S64});
1490 
1491   getActionDefinitionsBuilder({
1492       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1493       G_FCOPYSIGN,
1494 
1495       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1496       G_READ_REGISTER,
1497       G_WRITE_REGISTER,
1498 
1499       G_SADDO, G_SSUBO,
1500 
      // TODO: Implement
1502       G_FMINIMUM, G_FMAXIMUM,
1503       G_FSHL
1504     }).lower();
1505 
1506   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1507         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1508         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1509     .unsupported();
1510 
1511   computeTables();
1512   verify(*ST.getInstrInfo());
1513 }
1514 
1515 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1516                                          MachineInstr &MI) const {
1517   MachineIRBuilder &B = Helper.MIRBuilder;
1518   MachineRegisterInfo &MRI = *B.getMRI();
1519   GISelChangeObserver &Observer = Helper.Observer;
1520 
1521   switch (MI.getOpcode()) {
1522   case TargetOpcode::G_ADDRSPACE_CAST:
1523     return legalizeAddrSpaceCast(MI, MRI, B);
1524   case TargetOpcode::G_FRINT:
1525     return legalizeFrint(MI, MRI, B);
1526   case TargetOpcode::G_FCEIL:
1527     return legalizeFceil(MI, MRI, B);
1528   case TargetOpcode::G_INTRINSIC_TRUNC:
1529     return legalizeIntrinsicTrunc(MI, MRI, B);
1530   case TargetOpcode::G_SITOFP:
1531     return legalizeITOFP(MI, MRI, B, true);
1532   case TargetOpcode::G_UITOFP:
1533     return legalizeITOFP(MI, MRI, B, false);
1534   case TargetOpcode::G_FPTOSI:
1535     return legalizeFPTOI(MI, MRI, B, true);
1536   case TargetOpcode::G_FPTOUI:
1537     return legalizeFPTOI(MI, MRI, B, false);
1538   case TargetOpcode::G_FMINNUM:
1539   case TargetOpcode::G_FMAXNUM:
1540   case TargetOpcode::G_FMINNUM_IEEE:
1541   case TargetOpcode::G_FMAXNUM_IEEE:
1542     return legalizeMinNumMaxNum(Helper, MI);
1543   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1544     return legalizeExtractVectorElt(MI, MRI, B);
1545   case TargetOpcode::G_INSERT_VECTOR_ELT:
1546     return legalizeInsertVectorElt(MI, MRI, B);
1547   case TargetOpcode::G_SHUFFLE_VECTOR:
1548     return legalizeShuffleVector(MI, MRI, B);
1549   case TargetOpcode::G_FSIN:
1550   case TargetOpcode::G_FCOS:
1551     return legalizeSinCos(MI, MRI, B);
1552   case TargetOpcode::G_GLOBAL_VALUE:
1553     return legalizeGlobalValue(MI, MRI, B);
1554   case TargetOpcode::G_LOAD:
1555     return legalizeLoad(MI, MRI, B, Observer);
1556   case TargetOpcode::G_FMAD:
1557     return legalizeFMad(MI, MRI, B);
1558   case TargetOpcode::G_FDIV:
1559     return legalizeFDIV(MI, MRI, B);
1560   case TargetOpcode::G_UDIV:
1561   case TargetOpcode::G_UREM:
1562     return legalizeUDIV_UREM(MI, MRI, B);
1563   case TargetOpcode::G_SDIV:
1564   case TargetOpcode::G_SREM:
1565     return legalizeSDIV_SREM(MI, MRI, B);
1566   case TargetOpcode::G_ATOMIC_CMPXCHG:
1567     return legalizeAtomicCmpXChg(MI, MRI, B);
1568   case TargetOpcode::G_FLOG:
1569     return legalizeFlog(MI, B, numbers::ln2f);
1570   case TargetOpcode::G_FLOG10:
1571     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1572   case TargetOpcode::G_FEXP:
1573     return legalizeFExp(MI, B);
1574   case TargetOpcode::G_FPOW:
1575     return legalizeFPow(MI, B);
1576   case TargetOpcode::G_FFLOOR:
1577     return legalizeFFloor(MI, MRI, B);
1578   case TargetOpcode::G_BUILD_VECTOR:
1579     return legalizeBuildVector(MI, MRI, B);
1580   default:
1581     return false;
1582   }
1583 
1584   llvm_unreachable("expected switch to return");
1585 }
1586 
1587 Register AMDGPULegalizerInfo::getSegmentAperture(
1588   unsigned AS,
1589   MachineRegisterInfo &MRI,
1590   MachineIRBuilder &B) const {
1591   MachineFunction &MF = B.getMF();
1592   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1593   const LLT S32 = LLT::scalar(32);
1594 
1595   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1596 
1597   if (ST.hasApertureRegs()) {
1598     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1599     // getreg.
1600     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1601         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1602         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1603     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1604         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1605         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1606     unsigned Encoding =
1607         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1608         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1609         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1610 
1611     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1612 
1613     B.buildInstr(AMDGPU::S_GETREG_B32)
1614       .addDef(GetReg)
1615       .addImm(Encoding);
1616     MRI.setType(GetReg, S32);
1617 
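    // S_GETREG returns the selected 16-bit field right-justified; shifting it
    // left by the field width (WidthM1 + 1 == 16) reconstructs the 32-bit
    // aperture value, i.e. the high half of the 64-bit segment base address.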
1618     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1619     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1620   }
1621 
1622   Register QueuePtr = MRI.createGenericVirtualRegister(
1623     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1624 
1625   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1626   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1627     return Register();
1628 
1629   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1630   // private_segment_aperture_base_hi.
1631   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1632 
1633   // TODO: can we be smarter about machine pointer info?
1634   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1635   MachineMemOperand *MMO = MF.getMachineMemOperand(
1636       PtrInfo,
1637       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1638           MachineMemOperand::MOInvariant,
1639       4, commonAlignment(Align(64), StructOffset));
1640 
1641   Register LoadAddr;
1642 
1643   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1644   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1645 }
1646 
1647 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1648   MachineInstr &MI, MachineRegisterInfo &MRI,
1649   MachineIRBuilder &B) const {
1650   MachineFunction &MF = B.getMF();
1651 
1652   const LLT S32 = LLT::scalar(32);
1653   Register Dst = MI.getOperand(0).getReg();
1654   Register Src = MI.getOperand(1).getReg();
1655 
1656   LLT DstTy = MRI.getType(Dst);
1657   LLT SrcTy = MRI.getType(Src);
1658   unsigned DestAS = DstTy.getAddressSpace();
1659   unsigned SrcAS = SrcTy.getAddressSpace();
1660 
1661   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1662   // vector element.
1663   assert(!DstTy.isVector());
1664 
1665   const AMDGPUTargetMachine &TM
1666     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1667 
1668   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1669   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1670     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1671     return true;
1672   }
1673 
1674   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1675     // Truncate.
1676     B.buildExtract(Dst, Src, 0);
1677     MI.eraseFromParent();
1678     return true;
1679   }
1680 
1681   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1682     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1683     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1684 
    // FIXME: This is a bit ugly: we merge two pointer-typed values to form a
    // pointer of a different type. Merge operands are required to be the same
    // type, but creating an extra ptrtoint would be kind of pointless.
1688     auto HighAddr = B.buildConstant(
1689       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1690     B.buildMerge(Dst, {Src, HighAddr});
1691     MI.eraseFromParent();
1692     return true;
1693   }
1694 
1695   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1696     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1697            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1698     unsigned NullVal = TM.getNullPointerValue(DestAS);
1699 
1700     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1701     auto FlatNull = B.buildConstant(SrcTy, 0);
1702 
1703     // Extract low 32-bits of the pointer.
1704     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1705 
1706     auto CmpRes =
1707         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1708     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1709 
1710     MI.eraseFromParent();
1711     return true;
1712   }
1713 
1714   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1715     return false;
1716 
1717   if (!ST.hasFlatAddressSpace())
1718     return false;
1719 
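  // Cast a segment (local/private) pointer to flat: the flat pointer is the
  // aperture base in the high 32 bits with the segment offset in the low 32
  // bits, and the segment null pointer maps to the flat null value.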
1720   auto SegmentNull =
1721       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1722   auto FlatNull =
1723       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1724 
1725   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1726   if (!ApertureReg.isValid())
1727     return false;
1728 
1729   auto CmpRes =
1730       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1731 
1732   // Coerce the type of the low half of the result so we can use merge_values.
1733   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1734 
1735   // TODO: Should we allow mismatched types but matching sizes in merges to
1736   // avoid the ptrtoint?
1737   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1738   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1739 
1740   MI.eraseFromParent();
1741   return true;
1742 }
1743 
1744 bool AMDGPULegalizerInfo::legalizeFrint(
1745   MachineInstr &MI, MachineRegisterInfo &MRI,
1746   MachineIRBuilder &B) const {
1747   Register Src = MI.getOperand(1).getReg();
1748   LLT Ty = MRI.getType(Src);
1749   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1750 
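  // Round to nearest by adding and subtracting 2**52. At that magnitude the
  // f64 mantissa holds no fractional bits, so the intermediate sum is rounded
  // to an integer and the subtraction recovers the rounded value. Inputs whose
  // magnitude exceeds C2 (just below 2**52) are already integral and are
  // passed through unchanged. For example, 2.3 + 2**52 rounds to 2**52 + 2,
  // and subtracting 2**52 gives 2.0.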
1751   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1752   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1753 
1754   auto C1 = B.buildFConstant(Ty, C1Val);
1755   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1756 
1757   // TODO: Should this propagate fast-math-flags?
1758   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1759   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1760 
1761   auto C2 = B.buildFConstant(Ty, C2Val);
1762   auto Fabs = B.buildFAbs(Ty, Src);
1763 
1764   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1765   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1766   MI.eraseFromParent();
1767   return true;
1768 }
1769 
1770 bool AMDGPULegalizerInfo::legalizeFceil(
1771   MachineInstr &MI, MachineRegisterInfo &MRI,
1772   MachineIRBuilder &B) const {
1773 
1774   const LLT S1 = LLT::scalar(1);
1775   const LLT S64 = LLT::scalar(64);
1776 
1777   Register Src = MI.getOperand(1).getReg();
1778   assert(MRI.getType(Src) == S64);
1779 
1780   // result = trunc(src)
1781   // if (src > 0.0 && src != result)
1782   //   result += 1.0
1783 
1784   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1785 
1786   const auto Zero = B.buildFConstant(S64, 0.0);
1787   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1791   auto Add = B.buildSelect(S64, And, One, Zero);
1792 
1793   // TODO: Should this propagate fast-math-flags?
1794   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1796 }
1797 
1798 static MachineInstrBuilder extractF64Exponent(Register Hi,
1799                                               MachineIRBuilder &B) {
1800   const unsigned FractBits = 52;
1801   const unsigned ExpBits = 11;
1802   LLT S32 = LLT::scalar(32);
1803 
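  // The biased exponent of an f64 occupies bits [62:52]. Within the high
  // 32-bit word that is an 11-bit field starting at bit 20 (FractBits - 32);
  // extract it with ubfe and subtract the bias (1023) to get the unbiased
  // exponent.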
1804   auto Const0 = B.buildConstant(S32, FractBits - 32);
1805   auto Const1 = B.buildConstant(S32, ExpBits);
1806 
1807   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1808     .addUse(Hi)
1809     .addUse(Const0.getReg(0))
1810     .addUse(Const1.getReg(0));
1811 
1812   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1813 }
1814 
1815 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1816   MachineInstr &MI, MachineRegisterInfo &MRI,
1817   MachineIRBuilder &B) const {
1818   const LLT S1 = LLT::scalar(1);
1819   const LLT S32 = LLT::scalar(32);
1820   const LLT S64 = LLT::scalar(64);
1821 
1822   Register Src = MI.getOperand(1).getReg();
1823   assert(MRI.getType(Src) == S64);
1824 
1825   // TODO: Should this use extract since the low half is unused?
1826   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1827   Register Hi = Unmerge.getReg(1);
1828 
1829   // Extract the upper half, since this is where we will find the sign and
1830   // exponent.
1831   auto Exp = extractF64Exponent(Hi, B);
1832 
1833   const unsigned FractBits = 52;
1834 
1835   // Extract the sign bit.
1836   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1837   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1838 
1839   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1840 
1841   const auto Zero32 = B.buildConstant(S32, 0);
1842 
1843   // Extend back to 64-bits.
1844   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1845 
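  // For an exponent E in [0, 51], the low (52 - E) mantissa bits are
  // fractional. Shifting the 52-bit fraction mask right by E and inverting it
  // yields a mask that keeps the sign, exponent, and integer-valued mantissa
  // bits, so the AND below clears exactly the fractional part.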
1846   auto Shr = B.buildAShr(S64, FractMask, Exp);
1847   auto Not = B.buildNot(S64, Shr);
1848   auto Tmp0 = B.buildAnd(S64, Src, Not);
1849   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1850 
1851   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1852   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1853 
1854   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1855   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1856   MI.eraseFromParent();
1857   return true;
1858 }
1859 
1860 bool AMDGPULegalizerInfo::legalizeITOFP(
1861   MachineInstr &MI, MachineRegisterInfo &MRI,
1862   MachineIRBuilder &B, bool Signed) const {
1863 
1864   Register Dst = MI.getOperand(0).getReg();
1865   Register Src = MI.getOperand(1).getReg();
1866 
1867   const LLT S64 = LLT::scalar(64);
1868   const LLT S32 = LLT::scalar(32);
1869 
1870   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1871 
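  // Convert the two 32-bit halves separately and recombine the result as
  // (fp)Hi * 2**32 + (fp)Lo. Only the high half carries the sign in the
  // signed case; the low half is always converted as unsigned. The 2**32
  // scaling is done exactly with ldexp.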
1872   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1873 
1874   auto CvtHi = Signed ?
1875     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1876     B.buildUITOFP(S64, Unmerge.getReg(1));
1877 
1878   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1879 
1880   auto ThirtyTwo = B.buildConstant(S32, 32);
1881   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1882     .addUse(CvtHi.getReg(0))
1883     .addUse(ThirtyTwo.getReg(0));
1884 
1885   // TODO: Should this propagate fast-math-flags?
1886   B.buildFAdd(Dst, LdExp, CvtLo);
1887   MI.eraseFromParent();
1888   return true;
1889 }
1890 
1891 // TODO: Copied from DAG implementation. Verify logic and document how this
1892 // actually works.
1893 bool AMDGPULegalizerInfo::legalizeFPTOI(
1894   MachineInstr &MI, MachineRegisterInfo &MRI,
1895   MachineIRBuilder &B, bool Signed) const {
1896 
1897   Register Dst = MI.getOperand(0).getReg();
1898   Register Src = MI.getOperand(1).getReg();
1899 
1900   const LLT S64 = LLT::scalar(64);
1901   const LLT S32 = LLT::scalar(32);
1902 
1903   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1904 
1905   unsigned Flags = MI.getFlags();
1906 
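  // Split trunc(x) into 32-bit halves using only FP operations:
  //   Hi = floor(trunc(x) * 2**-32)         (K0 == 2**-32)
  //   Lo = fma(Hi, -(2**32), trunc(x))      (K1 == -(2**32))
  // Hi and Lo are then converted to integers and merged into the 64-bit
  // result.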
1907   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1908   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1909   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1910 
1911   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1912   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1913   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1914 
1915   auto Hi = Signed ?
1916     B.buildFPTOSI(S32, FloorMul) :
1917     B.buildFPTOUI(S32, FloorMul);
1918   auto Lo = B.buildFPTOUI(S32, Fma);
1919 
1920   B.buildMerge(Dst, { Lo, Hi });
1921   MI.eraseFromParent();
1922 
1923   return true;
1924 }
1925 
1926 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1927                                                MachineInstr &MI) const {
1928   MachineFunction &MF = Helper.MIRBuilder.getMF();
1929   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1930 
1931   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1932                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1933 
1934   // With ieee_mode disabled, the instructions have the correct behavior
1935   // already for G_FMINNUM/G_FMAXNUM
1936   if (!MFI->getMode().IEEE)
1937     return !IsIEEEOp;
1938 
1939   if (IsIEEEOp)
1940     return true;
1941 
1942   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1946   MachineInstr &MI, MachineRegisterInfo &MRI,
1947   MachineIRBuilder &B) const {
1948   // TODO: Should move some of this into LegalizerHelper.
1949 
1950   // TODO: Promote dynamic indexing of s16 to s32
1951 
1952   // FIXME: Artifact combiner probably should have replaced the truncated
1953   // constant before this, so we shouldn't need
1954   // getConstantVRegValWithLookThrough.
1955   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1956     MI.getOperand(2).getReg(), MRI);
1957   if (!IdxVal) // Dynamic case will be selected to register indexing.
1958     return true;
1959 
1960   Register Dst = MI.getOperand(0).getReg();
1961   Register Vec = MI.getOperand(1).getReg();
1962 
1963   LLT VecTy = MRI.getType(Vec);
1964   LLT EltTy = VecTy.getElementType();
1965   assert(EltTy == MRI.getType(Dst));
1966 
1967   if (IdxVal->Value < VecTy.getNumElements())
1968     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1969   else
1970     B.buildUndef(Dst);
1971 
1972   MI.eraseFromParent();
1973   return true;
1974 }
1975 
1976 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1977   MachineInstr &MI, MachineRegisterInfo &MRI,
1978   MachineIRBuilder &B) const {
1979   // TODO: Should move some of this into LegalizerHelper.
1980 
1981   // TODO: Promote dynamic indexing of s16 to s32
1982 
1983   // FIXME: Artifact combiner probably should have replaced the truncated
1984   // constant before this, so we shouldn't need
1985   // getConstantVRegValWithLookThrough.
1986   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1987     MI.getOperand(3).getReg(), MRI);
1988   if (!IdxVal) // Dynamic case will be selected to register indexing.
1989     return true;
1990 
1991   Register Dst = MI.getOperand(0).getReg();
1992   Register Vec = MI.getOperand(1).getReg();
1993   Register Ins = MI.getOperand(2).getReg();
1994 
1995   LLT VecTy = MRI.getType(Vec);
1996   LLT EltTy = VecTy.getElementType();
1997   assert(EltTy == MRI.getType(Ins));
1998 
1999   if (IdxVal->Value < VecTy.getNumElements())
2000     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2001   else
2002     B.buildUndef(Dst);
2003 
2004   MI.eraseFromParent();
2005   return true;
2006 }
2007 
2008 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2009   MachineInstr &MI, MachineRegisterInfo &MRI,
2010   MachineIRBuilder &B) const {
2011   const LLT V2S16 = LLT::vector(2, 16);
2012 
2013   Register Dst = MI.getOperand(0).getReg();
2014   Register Src0 = MI.getOperand(1).getReg();
2015   LLT DstTy = MRI.getType(Dst);
2016   LLT SrcTy = MRI.getType(Src0);
2017 
2018   if (SrcTy == V2S16 && DstTy == V2S16 &&
2019       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2020     return true;
2021 
2022   MachineIRBuilder HelperBuilder(MI);
2023   GISelObserverWrapper DummyObserver;
2024   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2025   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2026 }
2027 
2028 bool AMDGPULegalizerInfo::legalizeSinCos(
2029   MachineInstr &MI, MachineRegisterInfo &MRI,
2030   MachineIRBuilder &B) const {
2031 
2032   Register DstReg = MI.getOperand(0).getReg();
2033   Register SrcReg = MI.getOperand(1).getReg();
2034   LLT Ty = MRI.getType(DstReg);
2035   unsigned Flags = MI.getFlags();
2036 
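  // The hardware sin/cos take an input pre-scaled by 1/(2*pi). On subtargets
  // where those instructions only accept a reduced input range, the scaled
  // value is first wrapped into [0, 1) with the fract intrinsic.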
2037   Register TrigVal;
2038   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2039   if (ST.hasTrigReducedRange()) {
2040     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2041     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2042       .addUse(MulVal.getReg(0))
2043       .setMIFlags(Flags).getReg(0);
2044   } else
2045     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2046 
2047   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2048     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2049   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2050     .addUse(TrigVal)
2051     .setMIFlags(Flags);
2052   MI.eraseFromParent();
2053   return true;
2054 }
2055 
2056 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2057                                                   MachineIRBuilder &B,
2058                                                   const GlobalValue *GV,
2059                                                   int64_t Offset,
2060                                                   unsigned GAFlags) const {
2061   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2062   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2063   // to the following code sequence:
2064   //
2065   // For constant address space:
2066   //   s_getpc_b64 s[0:1]
2067   //   s_add_u32 s0, s0, $symbol
2068   //   s_addc_u32 s1, s1, 0
2069   //
2070   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2071   //   a fixup or relocation is emitted to replace $symbol with a literal
2072   //   constant, which is a pc-relative offset from the encoding of the $symbol
2073   //   operand to the global variable.
2074   //
2075   // For global address space:
2076   //   s_getpc_b64 s[0:1]
2077   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2078   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2079   //
2080   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2081   //   fixups or relocations are emitted to replace $symbol@*@lo and
2082   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2083   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2084   //   operand to the global variable.
2085   //
2086   // What we want here is an offset from the value returned by s_getpc
2087   // (which is the address of the s_add_u32 instruction) to the global
2088   // variable, but since the encoding of $symbol starts 4 bytes after the start
2089   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2090   // small. This requires us to add 4 to the global variable offset in order to
2091   // compute the correct address.
2092 
2093   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2094 
2095   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2096     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2097 
2098   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2099     .addDef(PCReg);
2100 
2101   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2102   if (GAFlags == SIInstrInfo::MO_NONE)
2103     MIB.addImm(0);
2104   else
2105     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2106 
2107   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2108 
2109   if (PtrTy.getSizeInBits() == 32)
2110     B.buildExtract(DstReg, PCReg, 0);
2111   return true;
}
2113 
2114 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2115   MachineInstr &MI, MachineRegisterInfo &MRI,
2116   MachineIRBuilder &B) const {
2117   Register DstReg = MI.getOperand(0).getReg();
2118   LLT Ty = MRI.getType(DstReg);
2119   unsigned AS = Ty.getAddressSpace();
2120 
2121   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2122   MachineFunction &MF = B.getMF();
2123   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2124 
2125   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2126     if (!MFI->isEntryFunction()) {
2127       const Function &Fn = MF.getFunction();
2128       DiagnosticInfoUnsupported BadLDSDecl(
2129         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2130         DS_Warning);
2131       Fn.getContext().diagnose(BadLDSDecl);
2132 
2133       // We currently don't have a way to correctly allocate LDS objects that
2134       // aren't directly associated with a kernel. We do force inlining of
2135       // functions that use local objects. However, if these dead functions are
2136       // not eliminated, we don't want a compile time error. Just emit a warning
2137       // and a trap, since there should be no callable path here.
2138       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2139       B.buildUndef(DstReg);
2140       MI.eraseFromParent();
2141       return true;
2142     }
2143 
2144     // TODO: We could emit code to handle the initialization somewhere.
2145     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2146       const SITargetLowering *TLI = ST.getTargetLowering();
2147       if (!TLI->shouldUseLDSConstAddress(GV)) {
2148         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2149         return true; // Leave in place;
2150       }
2151 
2152       B.buildConstant(
2153           DstReg,
2154           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2155       MI.eraseFromParent();
2156       return true;
2157     }
2158 
2159     const Function &Fn = MF.getFunction();
2160     DiagnosticInfoUnsupported BadInit(
2161       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2162     Fn.getContext().diagnose(BadInit);
2163     return true;
2164   }
2165 
2166   const SITargetLowering *TLI = ST.getTargetLowering();
2167 
2168   if (TLI->shouldEmitFixup(GV)) {
2169     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2170     MI.eraseFromParent();
2171     return true;
2172   }
2173 
2174   if (TLI->shouldEmitPCReloc(GV)) {
2175     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2176     MI.eraseFromParent();
2177     return true;
2178   }
2179 
2180   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2181   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2182 
2183   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2184       MachinePointerInfo::getGOT(MF),
2185       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2186           MachineMemOperand::MOInvariant,
2187       8 /*Size*/, Align(8));
2188 
2189   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2190 
2191   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2193     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2194     B.buildExtract(DstReg, Load, 0);
2195   } else
2196     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2197 
2198   MI.eraseFromParent();
2199   return true;
2200 }
2201 
2202 bool AMDGPULegalizerInfo::legalizeLoad(
2203   MachineInstr &MI, MachineRegisterInfo &MRI,
2204   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2205   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2206   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2207   Observer.changingInstr(MI);
2208   MI.getOperand(1).setReg(Cast.getReg(0));
2209   Observer.changedInstr(MI);
2210   return true;
2211 }
2212 
2213 bool AMDGPULegalizerInfo::legalizeFMad(
2214   MachineInstr &MI, MachineRegisterInfo &MRI,
2215   MachineIRBuilder &B) const {
2216   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2217   assert(Ty.isScalar());
2218 
2219   MachineFunction &MF = B.getMF();
2220   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2221 
2222   // TODO: Always legal with future ftz flag.
2223   // FIXME: Do we need just output?
2224   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2225     return true;
2226   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2227     return true;
2228 
2229   MachineIRBuilder HelperBuilder(MI);
2230   GISelObserverWrapper DummyObserver;
2231   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2232   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2233 }
2234 
2235 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2236   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2237   Register DstReg = MI.getOperand(0).getReg();
2238   Register PtrReg = MI.getOperand(1).getReg();
2239   Register CmpVal = MI.getOperand(2).getReg();
2240   Register NewVal = MI.getOperand(3).getReg();
2241 
2242   assert(SITargetLowering::isFlatGlobalAddrSpace(
2243            MRI.getType(PtrReg).getAddressSpace()) &&
2244          "this should not have been custom lowered");
2245 
2246   LLT ValTy = MRI.getType(CmpVal);
2247   LLT VecTy = LLT::vector(2, ValTy);
2248 
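  // The target's cmpxchg pseudo takes the new value and the compare value as
  // a single packed operand, so build the two-element vector <NewVal, CmpVal>
  // and let instruction selection split it into the data/compare register
  // pair expected by the hardware cmpswap instructions.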
2249   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2250 
2251   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2252     .addDef(DstReg)
2253     .addUse(PtrReg)
2254     .addUse(PackedVal)
2255     .setMemRefs(MI.memoperands());
2256 
2257   MI.eraseFromParent();
2258   return true;
2259 }
2260 
2261 bool AMDGPULegalizerInfo::legalizeFlog(
2262   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2263   Register Dst = MI.getOperand(0).getReg();
2264   Register Src = MI.getOperand(1).getReg();
2265   LLT Ty = B.getMRI()->getType(Dst);
2266   unsigned Flags = MI.getFlags();
2267 
2268   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2269   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2270 
2271   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2272   MI.eraseFromParent();
2273   return true;
2274 }
2275 
2276 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2277                                        MachineIRBuilder &B) const {
2278   Register Dst = MI.getOperand(0).getReg();
2279   Register Src = MI.getOperand(1).getReg();
2280   unsigned Flags = MI.getFlags();
2281   LLT Ty = B.getMRI()->getType(Dst);
2282 
2283   auto K = B.buildFConstant(Ty, numbers::log2e);
2284   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2285   B.buildFExp2(Dst, Mul, Flags);
2286   MI.eraseFromParent();
2287   return true;
2288 }
2289 
2290 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2291                                        MachineIRBuilder &B) const {
2292   Register Dst = MI.getOperand(0).getReg();
2293   Register Src0 = MI.getOperand(1).getReg();
2294   Register Src1 = MI.getOperand(2).getReg();
2295   unsigned Flags = MI.getFlags();
2296   LLT Ty = B.getMRI()->getType(Dst);
2297   const LLT S16 = LLT::scalar(16);
2298   const LLT S32 = LLT::scalar(32);
2299 
2300   if (Ty == S32) {
2301     auto Log = B.buildFLog2(S32, Src0, Flags);
2302     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2303       .addUse(Log.getReg(0))
2304       .addUse(Src1)
2305       .setMIFlags(Flags);
2306     B.buildFExp2(Dst, Mul, Flags);
2307   } else if (Ty == S16) {
2308     // There's no f16 fmul_legacy, so we need to convert for it.
2309     auto Log = B.buildFLog2(S16, Src0, Flags);
2310     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2311     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2312     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2313       .addUse(Ext0.getReg(0))
2314       .addUse(Ext1.getReg(0))
2315       .setMIFlags(Flags);
2316 
2317     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2318   } else
2319     return false;
2320 
2321   MI.eraseFromParent();
2322   return true;
2323 }
2324 
2325 // Find a source register, ignoring any possible source modifiers.
2326 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2327   Register ModSrc = OrigSrc;
2328   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2329     ModSrc = SrcFNeg->getOperand(1).getReg();
2330     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2331       ModSrc = SrcFAbs->getOperand(1).getReg();
2332   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2333     ModSrc = SrcFAbs->getOperand(1).getReg();
2334   return ModSrc;
2335 }
2336 
2337 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2338                                          MachineRegisterInfo &MRI,
2339                                          MachineIRBuilder &B) const {
2340 
2341   const LLT S1 = LLT::scalar(1);
2342   const LLT S64 = LLT::scalar(64);
2343   Register Dst = MI.getOperand(0).getReg();
2344   Register OrigSrc = MI.getOperand(1).getReg();
2345   unsigned Flags = MI.getFlags();
2346   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2347          "this should not have been custom lowered");
2348 
2349   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2350   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2351   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2352   // V_FRACT bug is:
2353   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2354   //
2355   // Convert floor(x) to (x - fract(x))
2356 
2357   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2358     .addUse(OrigSrc)
2359     .setMIFlags(Flags);
2360 
2361   // Give source modifier matching some assistance before obscuring a foldable
2362   // pattern.
2363 
2364   // TODO: We can avoid the neg on the fract? The input sign to fract
2365   // shouldn't matter?
2366   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2367 
2368   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2369 
2370   Register Min = MRI.createGenericVirtualRegister(S64);
2371 
2372   // We don't need to concern ourselves with the snan handling difference, so
2373   // use the one which will directly select.
2374   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2375   if (MFI->getMode().IEEE)
2376     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2377   else
2378     B.buildFMinNum(Min, Fract, Const, Flags);
2379 
2380   Register CorrectedFract = Min;
2381   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2382     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2383     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2384   }
2385 
2386   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2387   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2388 
2389   MI.eraseFromParent();
2390   return true;
2391 }
2392 
2393 // Turn an illegal packed v2s16 build vector into bit operations.
2394 // TODO: This should probably be a bitcast action in LegalizerHelper.
2395 bool AMDGPULegalizerInfo::legalizeBuildVector(
2396   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2397   Register Dst = MI.getOperand(0).getReg();
2398   const LLT S32 = LLT::scalar(32);
2399   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2400 
2401   Register Src0 = MI.getOperand(1).getReg();
2402   Register Src1 = MI.getOperand(2).getReg();
2403   assert(MRI.getType(Src0) == LLT::scalar(16));
2404 
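  // Pack the two 16-bit elements into the low and high halves of an s32 with
  // a merge, then reinterpret the 32-bit value as <2 x s16>.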
2405   auto Merge = B.buildMerge(S32, {Src0, Src1});
2406   B.buildBitcast(Dst, Merge);
2407 
2408   MI.eraseFromParent();
2409   return true;
2410 }
2411 
2412 // Return the use branch instruction, otherwise null if the usage is invalid.
2413 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2414                                        MachineRegisterInfo &MRI,
2415                                        MachineInstr *&Br,
2416                                        MachineBasicBlock *&UncondBrTarget) {
2417   Register CondDef = MI.getOperand(0).getReg();
2418   if (!MRI.hasOneNonDBGUse(CondDef))
2419     return nullptr;
2420 
2421   MachineBasicBlock *Parent = MI.getParent();
2422   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2423   if (UseMI.getParent() != Parent ||
2424       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2425     return nullptr;
2426 
2427   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2428   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2429   if (Next == Parent->end()) {
2430     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2431     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2432       return nullptr;
2433     UncondBrTarget = &*NextMBB;
2434   } else {
2435     if (Next->getOpcode() != AMDGPU::G_BR)
2436       return nullptr;
2437     Br = &*Next;
2438     UncondBrTarget = Br->getOperand(0).getMBB();
2439   }
2440 
2441   return &UseMI;
2442 }
2443 
2444 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2445                                                MachineRegisterInfo &MRI,
2446                                                Register LiveIn,
2447                                                Register PhyReg) const {
2448   assert(PhyReg.isPhysical() && "Physical register expected");
2449 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2452   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2453   if (!MRI.getVRegDef(LiveIn)) {
2454     // FIXME: Should have scoped insert pt
2455     MachineBasicBlock &OrigInsBB = B.getMBB();
2456     auto OrigInsPt = B.getInsertPt();
2457 
2458     MachineBasicBlock &EntryMBB = B.getMF().front();
2459     EntryMBB.addLiveIn(PhyReg);
2460     B.setInsertPt(EntryMBB, EntryMBB.begin());
2461     B.buildCopy(LiveIn, PhyReg);
2462 
2463     B.setInsertPt(OrigInsBB, OrigInsPt);
2464   }
2465 
2466   return LiveIn;
2467 }
2468 
2469 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2470                                                 MachineRegisterInfo &MRI,
2471                                                 Register PhyReg, LLT Ty,
2472                                                 bool InsertLiveInCopy) const {
2473   assert(PhyReg.isPhysical() && "Physical register expected");
2474 
  // Get or create the virtual live-in register.
2476   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2477   if (!LiveIn) {
2478     LiveIn = MRI.createGenericVirtualRegister(Ty);
2479     MRI.addLiveIn(PhyReg, LiveIn);
2480   }
2481 
  // When the actual true copy required is from a virtual register to a
  // physical register (to be inserted later), the live-in copy from the
  // physical register to a virtual register is not required.
2485   if (!InsertLiveInCopy)
2486     return LiveIn;
2487 
2488   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2489 }
2490 
2491 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2492     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2493   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2494   const ArgDescriptor *Arg;
2495   const TargetRegisterClass *RC;
2496   LLT ArgTy;
2497   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2498   if (!Arg) {
2499     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2500     return nullptr;
2501   }
2502   return Arg;
2503 }
2504 
2505 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2506                                          const ArgDescriptor *Arg) const {
2507   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2508     return false; // TODO: Handle these
2509 
2510   Register SrcReg = Arg->getRegister();
2511   assert(SrcReg.isPhysical() && "Physical register expected");
2512   assert(DstReg.isVirtual() && "Virtual register expected");
2513 
2514   MachineRegisterInfo &MRI = *B.getMRI();
2515 
2516   LLT Ty = MRI.getType(DstReg);
2517   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2518 
2519   if (Arg->isMasked()) {
2520     // TODO: Should we try to emit this once in the entry block?
2521     const LLT S32 = LLT::scalar(32);
2522     const unsigned Mask = Arg->getMask();
2523     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2524 
2525     Register AndMaskSrc = LiveIn;
2526 
2527     if (Shift != 0) {
2528       auto ShiftAmt = B.buildConstant(S32, Shift);
2529       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2530     }
2531 
2532     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2533   } else {
2534     B.buildCopy(DstReg, LiveIn);
2535   }
2536 
2537   return true;
2538 }
2539 
2540 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2541     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2542     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2543 
2544   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2545   if (!Arg)
2546     return false;
2547 
2548   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2549     return false;
2550 
2551   MI.eraseFromParent();
2552   return true;
2553 }
2554 
2555 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2556                                        MachineRegisterInfo &MRI,
2557                                        MachineIRBuilder &B) const {
2558   Register Dst = MI.getOperand(0).getReg();
2559   LLT DstTy = MRI.getType(Dst);
2560   LLT S16 = LLT::scalar(16);
2561   LLT S32 = LLT::scalar(32);
2562   LLT S64 = LLT::scalar(64);
2563 
2564   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2565     return true;
2566 
2567   if (DstTy == S16)
2568     return legalizeFDIV16(MI, MRI, B);
2569   if (DstTy == S32)
2570     return legalizeFDIV32(MI, MRI, B);
2571   if (DstTy == S64)
2572     return legalizeFDIV64(MI, MRI, B);
2573 
2574   return false;
2575 }
2576 
2577 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2578                                                   Register DstReg,
2579                                                   Register X,
2580                                                   Register Y,
2581                                                   bool IsDiv) const {
2582   const LLT S1 = LLT::scalar(1);
2583   const LLT S32 = LLT::scalar(32);
2584 
2585   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2586   // algorithm used here.
2587 
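  // In short: compute Z ~= 2**32 / Y as a fixed-point reciprocal (the scale
  // constant 0x4f7ffffe is just below 2**32, which biases the estimate low),
  // refine it with one round of Newton-Raphson, form an initial
  // quotient/remainder from the high 32 bits of X * Z, and then correct them
  // with at most two conditional adjustments.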
2588   // Initial estimate of inv(y).
2589   auto FloatY = B.buildUITOFP(S32, Y);
2590   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2591   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2592   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2593   auto Z = B.buildFPTOUI(S32, ScaledY);
2594 
2595   // One round of UNR.
2596   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2597   auto NegYZ = B.buildMul(S32, NegY, Z);
2598   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2599 
2600   // Quotient/remainder estimate.
2601   auto Q = B.buildUMulH(S32, X, Z);
2602   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2603 
2604   // First quotient/remainder refinement.
2605   auto One = B.buildConstant(S32, 1);
2606   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2607   if (IsDiv)
2608     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2609   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2610 
2611   // Second quotient/remainder refinement.
2612   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2613   if (IsDiv)
2614     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2615   else
2616     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2617 }
2618 
2619 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2620                                               MachineRegisterInfo &MRI,
2621                                               MachineIRBuilder &B) const {
2622   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2623   Register DstReg = MI.getOperand(0).getReg();
2624   Register Num = MI.getOperand(1).getReg();
2625   Register Den = MI.getOperand(2).getReg();
2626   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2627   MI.eraseFromParent();
2628   return true;
2629 }
2630 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2632 //
2633 // Return lo, hi of result
2634 //
2635 // %cvt.lo = G_UITOFP Val.lo
2636 // %cvt.hi = G_UITOFP Val.hi
2637 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2638 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2639 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2640 // %mul2 = G_FMUL %mul1, 2**(-32)
2641 // %trunc = G_INTRINSIC_TRUNC %mul2
2642 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2643 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2644 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2645                                                        Register Val) {
2646   const LLT S32 = LLT::scalar(32);
2647   auto Unmerge = B.buildUnmerge(S32, Val);
2648 
2649   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2650   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2651 
2652   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2653                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2654 
2655   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2656   auto Mul1 =
2657       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2658 
2659   // 2**(-32)
2660   auto Mul2 =
2661       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2662   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2663 
2664   // -(2**32)
2665   auto Mad2 = B.buildFMAD(S32, Trunc,
2666                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2667 
2668   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2669   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2670 
2671   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2672 }
2673 
2674 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2675                                                   Register DstReg,
2676                                                   Register Numer,
2677                                                   Register Denom,
2678                                                   bool IsDiv) const {
2679   const LLT S32 = LLT::scalar(32);
2680   const LLT S64 = LLT::scalar(64);
2681   const LLT S1 = LLT::scalar(1);
2682   Register RcpLo, RcpHi;
2683 
2684   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2685 
2686   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2687 
2688   auto Zero64 = B.buildConstant(S64, 0);
2689   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2690 
2691   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2692   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2693 
2694   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2695   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2696   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2697 
2698   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2699   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2700   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2701   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2702 
2703   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2704   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2705   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2706   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2707   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2708 
2709   auto Zero32 = B.buildConstant(S32, 0);
2710   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2711   auto Add2_HiC =
2712       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2713   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2714   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2715 
2716   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2717   Register NumerLo = UnmergeNumer.getReg(0);
2718   Register NumerHi = UnmergeNumer.getReg(1);
2719 
2720   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2721   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2722   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2723   Register Mul3_Lo = UnmergeMul3.getReg(0);
2724   Register Mul3_Hi = UnmergeMul3.getReg(1);
2725   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2726   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2727   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2728   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2729 
2730   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2731   Register DenomLo = UnmergeDenom.getReg(0);
2732   Register DenomHi = UnmergeDenom.getReg(1);
2733 
2734   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2735   auto C1 = B.buildSExt(S32, CmpHi);
2736 
2737   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2738   auto C2 = B.buildSExt(S32, CmpLo);
2739 
2740   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2741   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2742 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional, and we have four
  // selects after the potential endif to substitute for PHIs.
2746 
2747   // if C3 != 0 ...
2748   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2749   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2750   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2751   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2752 
2753   auto One64 = B.buildConstant(S64, 1);
2754   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2755 
2756   auto C4 =
2757       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2758   auto C5 =
2759       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2760   auto C6 = B.buildSelect(
2761       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2762 
2763   // if (C6 != 0)
2764   auto Add4 = B.buildAdd(S64, Add3, One64);
2765   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2766 
2767   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2768   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2769   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2770 
2771   // endif C6
2772   // endif C3
2773 
2774   if (IsDiv) {
2775     auto Sel1 = B.buildSelect(
2776         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2777     B.buildSelect(DstReg,
2778                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2779   } else {
2780     auto Sel2 = B.buildSelect(
2781         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2782     B.buildSelect(DstReg,
2783                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2784   }
2785 }
2786 
2787 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2788                                             MachineRegisterInfo &MRI,
2789                                             MachineIRBuilder &B) const {
2790   const LLT S64 = LLT::scalar(64);
2791   const LLT S32 = LLT::scalar(32);
2792   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2793   Register DstReg = MI.getOperand(0).getReg();
2794   Register Num = MI.getOperand(1).getReg();
2795   Register Den = MI.getOperand(2).getReg();
2796   LLT Ty = MRI.getType(DstReg);
2797 
2798   if (Ty == S32)
2799     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2800   else if (Ty == S64)
2801     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2802   else
2803     return false;
2804 
2805   MI.eraseFromParent();
  return true;
}
2809 
2810 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2811                                             MachineRegisterInfo &MRI,
2812                                             MachineIRBuilder &B) const {
2813   const LLT S64 = LLT::scalar(64);
2814   const LLT S32 = LLT::scalar(32);
2815 
2816   Register DstReg = MI.getOperand(0).getReg();
2817   const LLT Ty = MRI.getType(DstReg);
2818   if (Ty != S32 && Ty != S64)
2819     return false;
2820 
2821   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2822 
2823   Register LHS = MI.getOperand(1).getReg();
2824   Register RHS = MI.getOperand(2).getReg();
2825 
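  // Reduce to an unsigned divide: compute |LHS| and |RHS| as (x + sign) ^
  // sign, where sign = x >> (bits - 1) is all ones for negative values. The
  // quotient is negated when the operand signs differ, and the remainder
  // takes the sign of the LHS.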
2826   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2827   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2828   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2829 
2830   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2831   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2832 
2833   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2834   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2835 
2836   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2837   if (Ty == S32)
2838     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2839   else
2840     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2841 
2842   Register Sign;
2843   if (IsDiv)
2844     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2845   else
2846     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2847 
2848   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2849   B.buildSub(DstReg, UDivRem, Sign);
2850 
2851   MI.eraseFromParent();
2852   return true;
2853 }
2854 
2855 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2856                                                  MachineRegisterInfo &MRI,
2857                                                  MachineIRBuilder &B) const {
2858   Register Res = MI.getOperand(0).getReg();
2859   Register LHS = MI.getOperand(1).getReg();
2860   Register RHS = MI.getOperand(2).getReg();
2861 
2862   uint16_t Flags = MI.getFlags();
2863 
2864   LLT ResTy = MRI.getType(Res);
2865   LLT S32 = LLT::scalar(32);
2866   LLT S64 = LLT::scalar(64);
2867 
2868   const MachineFunction &MF = B.getMF();
2869   bool Unsafe =
2870     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2871 
2872   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2873     return false;
2874 
2875   if (!Unsafe && ResTy == S32 &&
2876       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2877     return false;
2878 
2879   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2880     // 1 / x -> RCP(x)
2881     if (CLHS->isExactlyValue(1.0)) {
2882       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2883         .addUse(RHS)
2884         .setMIFlags(Flags);
2885 
2886       MI.eraseFromParent();
2887       return true;
2888     }
2889 
2890     // -1 / x -> RCP( FNEG(x) )
2891     if (CLHS->isExactlyValue(-1.0)) {
2892       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2893       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2894         .addUse(FNeg.getReg(0))
2895         .setMIFlags(Flags);
2896 
2897       MI.eraseFromParent();
2898       return true;
2899     }
2900   }
2901 
2902   // x / y -> x * (1.0 / y)
2903   if (Unsafe) {
2904     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2905       .addUse(RHS)
2906       .setMIFlags(Flags);
2907     B.buildFMul(Res, LHS, RCP, Flags);
2908 
2909     MI.eraseFromParent();
2910     return true;
2911   }
2912 
2913   return false;
2914 }
2915 
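// 16-bit fdiv is computed in 32-bit precision: extend both operands to f32,
// multiply the numerator by rcp of the denominator, truncate back to f16, and
// let amdgcn.div.fixup correct the special-case results.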
2916 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2917                                          MachineRegisterInfo &MRI,
2918                                          MachineIRBuilder &B) const {
2919   Register Res = MI.getOperand(0).getReg();
2920   Register LHS = MI.getOperand(1).getReg();
2921   Register RHS = MI.getOperand(2).getReg();
2922 
2923   uint16_t Flags = MI.getFlags();
2924 
2925   LLT S16 = LLT::scalar(16);
2926   LLT S32 = LLT::scalar(32);
2927 
2928   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2929   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2930 
2931   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2932     .addUse(RHSExt.getReg(0))
2933     .setMIFlags(Flags);
2934 
2935   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2936   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2937 
2938   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2939     .addUse(RDst.getReg(0))
2940     .addUse(RHS)
2941     .addUse(LHS)
2942     .setMIFlags(Flags);
2943 
2944   MI.eraseFromParent();
2945   return true;
2946 }
2947 
2948 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2949 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2950 static void toggleSPDenormMode(bool Enable,
2951                                MachineIRBuilder &B,
2952                                const GCNSubtarget &ST,
2953                                AMDGPU::SIModeRegisterDefaults Mode) {
2954   // Set SP denorm mode to this value.
2955   unsigned SPDenormMode =
2956     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2957 
2958   if (ST.hasDenormModeInst()) {
2959     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2960     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2961 
2962     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2963     B.buildInstr(AMDGPU::S_DENORM_MODE)
2964       .addImm(NewDenormModeValue);
2965 
2966   } else {
2967     // Select FP32 bit field in mode register.
2968     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2969                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2970                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
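    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm control is the
    // 2-bit field starting at bit 4 of the MODE register.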
2971 
2972     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2973       .addImm(SPDenormMode)
2974       .addImm(SPDenormModeBitField);
2975   }
2976 }
2977 
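// f32 fdiv is expanded with the div_scale / div_fmas / div_fixup sequence:
// scale the operands into a safe range, refine rcp(denominator) with a chain
// of FMAs, then rescale and fix up the special cases. The intermediate FMA
// results must not be flushed, so FP32 denormals are temporarily enabled
// around the chain when the function's mode would otherwise flush them.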
2978 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2979                                          MachineRegisterInfo &MRI,
2980                                          MachineIRBuilder &B) const {
2981   Register Res = MI.getOperand(0).getReg();
2982   Register LHS = MI.getOperand(1).getReg();
2983   Register RHS = MI.getOperand(2).getReg();
2984   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2985   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2986 
2987   uint16_t Flags = MI.getFlags();
2988 
2989   LLT S32 = LLT::scalar(32);
2990   LLT S1 = LLT::scalar(1);
2991 
2992   auto One = B.buildFConstant(S32, 1.0f);
2993 
2994   auto DenominatorScaled =
2995     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2996       .addUse(LHS)
2997       .addUse(RHS)
2998       .addImm(0)
2999       .setMIFlags(Flags);
3000   auto NumeratorScaled =
3001     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3002       .addUse(LHS)
3003       .addUse(RHS)
3004       .addImm(1)
3005       .setMIFlags(Flags);
3006 
3007   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3008     .addUse(DenominatorScaled.getReg(0))
3009     .setMIFlags(Flags);
3010   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3011 
3012   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3013   // aren't modeled as reading it.
3014   if (!Mode.allFP32Denormals())
3015     toggleSPDenormMode(true, B, ST, Mode);
3016 
3017   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3018   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3019   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3020   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3021   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3022   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3023 
3024   if (!Mode.allFP32Denormals())
3025     toggleSPDenormMode(false, B, ST, Mode);
3026 
3027   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3028     .addUse(Fma4.getReg(0))
3029     .addUse(Fma1.getReg(0))
3030     .addUse(Fma3.getReg(0))
3031     .addUse(NumeratorScaled.getReg(1))
3032     .setMIFlags(Flags);
3033 
3034   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3035     .addUse(Fmas.getReg(0))
3036     .addUse(RHS)
3037     .addUse(LHS)
3038     .setMIFlags(Flags);
3039 
3040   MI.eraseFromParent();
3041   return true;
3042 }
3043 
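// f64 fdiv uses the same div_scale / div_fmas / div_fixup structure as f32,
// with a longer FMA refinement of rcp(denominator) and no denormal mode
// switching.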
3044 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3045                                          MachineRegisterInfo &MRI,
3046                                          MachineIRBuilder &B) const {
3047   Register Res = MI.getOperand(0).getReg();
3048   Register LHS = MI.getOperand(1).getReg();
3049   Register RHS = MI.getOperand(2).getReg();
3050 
3051   uint16_t Flags = MI.getFlags();
3052 
3053   LLT S64 = LLT::scalar(64);
3054   LLT S1 = LLT::scalar(1);
3055 
3056   auto One = B.buildFConstant(S64, 1.0);
3057 
3058   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3059     .addUse(LHS)
3060     .addUse(RHS)
3061     .addImm(0)
3062     .setMIFlags(Flags);
3063 
3064   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3065 
3066   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3067     .addUse(DivScale0.getReg(0))
3068     .setMIFlags(Flags);
3069 
3070   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3071   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3072   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3073 
3074   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3075     .addUse(LHS)
3076     .addUse(RHS)
3077     .addImm(1)
3078     .setMIFlags(Flags);
3079 
3080   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3081   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3082   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3083 
3084   Register Scale;
3085   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3088 
3089     LLT S32 = LLT::scalar(32);
3090 
3091     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3092     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3093     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3094     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3095 
3096     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3097                               Scale1Unmerge.getReg(1));
3098     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3099                               Scale0Unmerge.getReg(1));
3100     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3101   } else {
3102     Scale = DivScale1.getReg(1);
3103   }
3104 
3105   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3106     .addUse(Fma4.getReg(0))
3107     .addUse(Fma3.getReg(0))
3108     .addUse(Mul.getReg(0))
3109     .addUse(Scale)
3110     .setMIFlags(Flags);
3111 
3112   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3113     .addUse(Fmas.getReg(0))
3114     .addUse(RHS)
3115     .addUse(LHS)
3116     .setMIFlags(Flags);
3117 
3118   MI.eraseFromParent();
3119   return true;
3120 }
3121 
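// amdgcn.fdiv.fast is lowered to x * rcp(y), with a pre-scale to keep rcp in
// range: if |y| > 2^96 the denominator is first multiplied by 2^-32 and the
// quotient is scaled by the same factor to compensate; otherwise the scale is
// 1.0.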
3122 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3123                                                  MachineRegisterInfo &MRI,
3124                                                  MachineIRBuilder &B) const {
3125   Register Res = MI.getOperand(0).getReg();
3126   Register LHS = MI.getOperand(2).getReg();
3127   Register RHS = MI.getOperand(3).getReg();
3128   uint16_t Flags = MI.getFlags();
3129 
3130   LLT S32 = LLT::scalar(32);
3131   LLT S1 = LLT::scalar(1);
3132 
3133   auto Abs = B.buildFAbs(S32, RHS, Flags);
3134   const APFloat C0Val(1.0f);
3135 
3136   auto C0 = B.buildConstant(S32, 0x6f800000);
3137   auto C1 = B.buildConstant(S32, 0x2f800000);
3138   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3139 
3140   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3141   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3142 
3143   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3144 
3145   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3146     .addUse(Mul0.getReg(0))
3147     .setMIFlags(Flags);
3148 
3149   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3150 
3151   B.buildFMul(Res, Sel, Mul1, Flags);
3152 
3153   MI.eraseFromParent();
3154   return true;
3155 }
3156 
3157 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3158                                             MachineRegisterInfo &MRI,
3159                                             MachineIRBuilder &B) const {
3160   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3161   uint64_t Offset =
3162     ST.getTargetLowering()->getImplicitParameterOffset(
3163       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3164   LLT DstTy = MRI.getType(DstReg);
3165   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3166 
3167   const ArgDescriptor *Arg;
3168   const TargetRegisterClass *RC;
3169   LLT ArgTy;
3170   std::tie(Arg, RC, ArgTy) =
3171       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3172   if (!Arg)
3173     return false;
3174 
3175   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3176   if (!loadInputValue(KernargPtrReg, B, Arg))
3177     return false;
3178 
3179   // FIXME: This should be nuw
3180   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3181   return true;
3182 }
3183 
3184 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3185                                                  MachineRegisterInfo &MRI,
3186                                                  MachineIRBuilder &B) const {
3187   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3188   if (!MFI->isEntryFunction()) {
3189     return legalizePreloadedArgIntrin(MI, MRI, B,
3190                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3191   }
3192 
3193   Register DstReg = MI.getOperand(0).getReg();
3194   if (!getImplicitArgPtr(DstReg, MRI, B))
3195     return false;
3196 
3197   MI.eraseFromParent();
3198   return true;
3199 }
3200 
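// is.shared / is.private check whether the high 32 bits of a flat pointer
// match the aperture for the LDS or private address space, respectively.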
3201 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3202                                               MachineRegisterInfo &MRI,
3203                                               MachineIRBuilder &B,
3204                                               unsigned AddrSpace) const {
3205   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3206   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3207   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3208   MI.eraseFromParent();
3209   return true;
3210 }
3211 
3212 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3213 // offset (the offset that is included in bounds checking and swizzling, to be
3214 // split between the instruction's voffset and immoffset fields) and soffset
3215 // (the offset that is excluded from bounds checking and swizzling, to go in
3216 // the instruction's soffset field).  This function takes the first kind of
3217 // offset and figures out how to split it between voffset and immoffset.
3218 std::tuple<Register, unsigned, unsigned>
3219 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3220                                         Register OrigOffset) const {
3221   const unsigned MaxImm = 4095;
3222   Register BaseReg;
3223   unsigned TotalConstOffset;
3224   MachineInstr *OffsetDef;
3225   const LLT S32 = LLT::scalar(32);
3226 
3227   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3228     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3229 
3230   unsigned ImmOffset = TotalConstOffset;
3231 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
3235   // of being CSEd with the copy/add for another similar load/store.
3236   // However, do not do that rounding down to a multiple of 4096 if that is a
3237   // negative number, as it appears to be illegal to have a negative offset
3238   // in the vgpr, even if adding the immediate offset makes it positive.
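  // For example, a total constant offset of 5000 is split into ImmOffset =
  // 5000 & 4095 = 904, with the remaining 4096 added into the voffset base.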
3239   unsigned Overflow = ImmOffset & ~MaxImm;
3240   ImmOffset -= Overflow;
3241   if ((int32_t)Overflow < 0) {
3242     Overflow += ImmOffset;
3243     ImmOffset = 0;
3244   }
3245 
3246   if (Overflow != 0) {
3247     if (!BaseReg) {
3248       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3249     } else {
3250       auto OverflowVal = B.buildConstant(S32, Overflow);
3251       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3252     }
3253   }
3254 
3255   if (!BaseReg)
3256     BaseReg = B.buildConstant(S32, 0).getReg(0);
3257 
3258   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3259 }
3260 
3261 /// Handle register layout difference for f16 images for some subtargets.
3262 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3263                                              MachineRegisterInfo &MRI,
3264                                              Register Reg) const {
3265   if (!ST.hasUnpackedD16VMem())
3266     return Reg;
3267 
3268   const LLT S16 = LLT::scalar(16);
3269   const LLT S32 = LLT::scalar(32);
3270   LLT StoreVT = MRI.getType(Reg);
3271   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3272 
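  // In the unpacked layout each 16-bit element occupies the low half of its
  // own 32-bit register, so any-extend every element to s32 and rebuild the
  // vector with s32 elements.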
3273   auto Unmerge = B.buildUnmerge(S16, Reg);
3274 
3275   SmallVector<Register, 4> WideRegs;
3276   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3277     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3278 
3279   int NumElts = StoreVT.getNumElements();
3280 
3281   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3282 }
3283 
3284 Register AMDGPULegalizerInfo::fixStoreSourceType(
3285   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3286   MachineRegisterInfo *MRI = B.getMRI();
3287   LLT Ty = MRI->getType(VData);
3288 
3289   const LLT S16 = LLT::scalar(16);
3290 
3291   // Fixup illegal register types for i8 stores.
3292   if (Ty == LLT::scalar(8) || Ty == S16) {
3293     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3294     return AnyExt;
3295   }
3296 
3297   if (Ty.isVector()) {
3298     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3299       if (IsFormat)
3300         return handleD16VData(B, *MRI, VData);
3301     }
3302   }
3303 
3304   return VData;
3305 }
3306 
3307 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3308                                               MachineRegisterInfo &MRI,
3309                                               MachineIRBuilder &B,
3310                                               bool IsTyped,
3311                                               bool IsFormat) const {
3312   Register VData = MI.getOperand(1).getReg();
3313   LLT Ty = MRI.getType(VData);
3314   LLT EltTy = Ty.getScalarType();
3315   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3316   const LLT S32 = LLT::scalar(32);
3317 
3318   VData = fixStoreSourceType(B, VData, IsFormat);
3319   Register RSrc = MI.getOperand(2).getReg();
3320 
3321   MachineMemOperand *MMO = *MI.memoperands_begin();
3322   const int MemSize = MMO->getSize();
3323 
3324   unsigned ImmOffset;
3325   unsigned TotalOffset;
3326 
3327   // The typed intrinsics add an immediate after the registers.
3328   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3329 
3330   // The struct intrinsic variants add one additional operand over raw.
3331   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3332   Register VIndex;
3333   int OpOffset = 0;
3334   if (HasVIndex) {
3335     VIndex = MI.getOperand(3).getReg();
3336     OpOffset = 1;
3337   }
3338 
3339   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3340   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3341 
3342   unsigned Format = 0;
3343   if (IsTyped) {
3344     Format = MI.getOperand(5 + OpOffset).getImm();
3345     ++OpOffset;
3346   }
3347 
3348   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3349 
3350   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3351   if (TotalOffset != 0)
3352     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3353 
3354   unsigned Opc;
3355   if (IsTyped) {
3356     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3357                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3358   } else if (IsFormat) {
3359     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3360                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3361   } else {
3362     switch (MemSize) {
3363     case 1:
3364       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3365       break;
3366     case 2:
3367       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3368       break;
3369     default:
3370       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3371       break;
3372     }
3373   }
3374 
3375   if (!VIndex)
3376     VIndex = B.buildConstant(S32, 0).getReg(0);
3377 
3378   auto MIB = B.buildInstr(Opc)
3379     .addUse(VData)              // vdata
3380     .addUse(RSrc)               // rsrc
3381     .addUse(VIndex)             // vindex
3382     .addUse(VOffset)            // voffset
3383     .addUse(SOffset)            // soffset
3384     .addImm(ImmOffset);         // offset(imm)
3385 
3386   if (IsTyped)
3387     MIB.addImm(Format);
3388 
3389   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3390      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3391      .addMemOperand(MMO);
3392 
3393   MI.eraseFromParent();
3394   return true;
3395 }
3396 
3397 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3398                                              MachineRegisterInfo &MRI,
3399                                              MachineIRBuilder &B,
3400                                              bool IsFormat,
3401                                              bool IsTyped) const {
3402   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3403   MachineMemOperand *MMO = *MI.memoperands_begin();
3404   const int MemSize = MMO->getSize();
3405   const LLT S32 = LLT::scalar(32);
3406 
3407   Register Dst = MI.getOperand(0).getReg();
3408   Register RSrc = MI.getOperand(2).getReg();
3409 
3410   // The typed intrinsics add an immediate after the registers.
3411   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3412 
3413   // The struct intrinsic variants add one additional operand over raw.
3414   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3415   Register VIndex;
3416   int OpOffset = 0;
3417   if (HasVIndex) {
3418     VIndex = MI.getOperand(3).getReg();
3419     OpOffset = 1;
3420   }
3421 
3422   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3423   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3424 
3425   unsigned Format = 0;
3426   if (IsTyped) {
3427     Format = MI.getOperand(5 + OpOffset).getImm();
3428     ++OpOffset;
3429   }
3430 
3431   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3432   unsigned ImmOffset;
3433   unsigned TotalOffset;
3434 
3435   LLT Ty = MRI.getType(Dst);
3436   LLT EltTy = Ty.getScalarType();
3437   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3438   const bool Unpacked = ST.hasUnpackedD16VMem();
3439 
3440   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3441   if (TotalOffset != 0)
3442     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3443 
3444   unsigned Opc;
3445 
3446   if (IsTyped) {
3447     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3448                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3449   } else if (IsFormat) {
3450     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3451                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3452   } else {
3453     switch (MemSize) {
3454     case 1:
3455       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3456       break;
3457     case 2:
3458       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3459       break;
3460     default:
3461       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3462       break;
3463     }
3464   }
3465 
3466   Register LoadDstReg;
3467 
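  // Sub-dword loads and scalar d16 loads come back widened to 32 bits and need
  // a truncate; unpacked d16 vector loads come back with one s32 per element
  // and need to be repacked into the 16-bit result type.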
3468   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3469   LLT UnpackedTy = Ty.changeElementSize(32);
3470 
3471   if (IsExtLoad)
3472     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3473   else if (Unpacked && IsD16 && Ty.isVector())
3474     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3475   else
3476     LoadDstReg = Dst;
3477 
3478   if (!VIndex)
3479     VIndex = B.buildConstant(S32, 0).getReg(0);
3480 
3481   auto MIB = B.buildInstr(Opc)
3482     .addDef(LoadDstReg)         // vdata
3483     .addUse(RSrc)               // rsrc
3484     .addUse(VIndex)             // vindex
3485     .addUse(VOffset)            // voffset
3486     .addUse(SOffset)            // soffset
3487     .addImm(ImmOffset);         // offset(imm)
3488 
3489   if (IsTyped)
3490     MIB.addImm(Format);
3491 
3492   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3493      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3494      .addMemOperand(MMO);
3495 
3496   if (LoadDstReg != Dst) {
3497     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3498 
    // The result for an extending load was widened; truncate back down.
3500     if (IsExtLoad)
3501       B.buildTrunc(Dst, LoadDstReg);
3502     else {
3503       // Repack to original 16-bit vector result
3504       // FIXME: G_TRUNC should work, but legalization currently fails
3505       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3506       SmallVector<Register, 4> Repack;
3507       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3508         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3509       B.buildMerge(Dst, Repack);
3510     }
3511   }
3512 
3513   MI.eraseFromParent();
3514   return true;
3515 }
3516 
3517 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3518                                                MachineIRBuilder &B,
3519                                                bool IsInc) const {
3520   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3521                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3522   B.buildInstr(Opc)
3523     .addDef(MI.getOperand(0).getReg())
3524     .addUse(MI.getOperand(2).getReg())
3525     .addUse(MI.getOperand(3).getReg())
3526     .cloneMemRefs(MI);
3527   MI.eraseFromParent();
3528   return true;
3529 }
3530 
3531 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3532   switch (IntrID) {
3533   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3534   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3535     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3536   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3537   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3538     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3539   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3540   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3541     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3542   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3543   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3544     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3545   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3546   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3547     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3548   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3549   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3550     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3551   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3552   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3553     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3554   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3555   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3556     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3557   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3558   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3559     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3560   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3561   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3562     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3563   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3564   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3565     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3566   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3567   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3568     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3569   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3570   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3571     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3572   default:
3573     llvm_unreachable("unhandled atomic opcode");
3574   }
3575 }
3576 
3577 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3578                                                MachineIRBuilder &B,
3579                                                Intrinsic::ID IID) const {
3580   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3581                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3582 
3583   Register Dst = MI.getOperand(0).getReg();
3584   Register VData = MI.getOperand(2).getReg();
3585 
3586   Register CmpVal;
3587   int OpOffset = 0;
3588 
3589   if (IsCmpSwap) {
3590     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3591     ++OpOffset;
3592   }
3593 
3594   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3595   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3596 
3597   // The struct intrinsic variants add one additional operand over raw.
3598   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3599   Register VIndex;
3600   if (HasVIndex) {
3601     VIndex = MI.getOperand(4 + OpOffset).getReg();
3602     ++OpOffset;
3603   }
3604 
3605   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3606   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3607   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3608 
3609   MachineMemOperand *MMO = *MI.memoperands_begin();
3610 
3611   unsigned ImmOffset;
3612   unsigned TotalOffset;
3613   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3614   if (TotalOffset != 0)
3615     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3616 
3617   if (!VIndex)
3618     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3619 
3620   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3621     .addDef(Dst)
3622     .addUse(VData); // vdata
3623 
3624   if (IsCmpSwap)
3625     MIB.addReg(CmpVal);
3626 
3627   MIB.addUse(RSrc)               // rsrc
3628      .addUse(VIndex)             // vindex
3629      .addUse(VOffset)            // voffset
3630      .addUse(SOffset)            // soffset
3631      .addImm(ImmOffset)          // offset(imm)
3632      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3633      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3634      .addMemOperand(MMO);
3635 
3636   MI.eraseFromParent();
3637   return true;
3638 }
3639 
/// Pack the s16 typed address registers of \p MI into dword sized <2 x s16>
/// vectors, appending the packed registers to \p PackedAddrs.
3642 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3643                                         SmallVectorImpl<Register> &PackedAddrs,
3644                                         int AddrIdx, int DimIdx, int EndIdx,
3645                                         int NumGradients) {
3646   const LLT S16 = LLT::scalar(16);
3647   const LLT V2S16 = LLT::vector(2, 16);
3648 
3649   for (int I = AddrIdx; I < EndIdx; ++I) {
3650     MachineOperand &SrcOp = MI.getOperand(I);
3651     if (!SrcOp.isReg())
3652       continue; // _L to _LZ may have eliminated this.
3653 
3654     Register AddrReg = SrcOp.getReg();
3655 
3656     if (I < DimIdx) {
3657       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3658       PackedAddrs.push_back(AddrReg);
3659     } else {
3660       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3661       // derivatives dx/dh and dx/dv are packed with undef.
3662       if (((I + 1) >= EndIdx) ||
3663           ((NumGradients / 2) % 2 == 1 &&
3664            (I == DimIdx + (NumGradients / 2) - 1 ||
3665             I == DimIdx + NumGradients - 1)) ||
3666           // Check for _L to _LZ optimization
3667           !MI.getOperand(I + 1).isReg()) {
3668         PackedAddrs.push_back(
3669             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3670                 .getReg(0));
3671       } else {
3672         PackedAddrs.push_back(
3673             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3674                 .getReg(0));
3675         ++I;
3676       }
3677     }
3678   }
3679 }
3680 
3681 /// Convert from separate vaddr components to a single vector address register,
3682 /// and replace the remaining operands with $noreg.
3683 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3684                                      int DimIdx, int NumVAddrs) {
3685   const LLT S32 = LLT::scalar(32);
3686 
3687   SmallVector<Register, 8> AddrRegs;
3688   for (int I = 0; I != NumVAddrs; ++I) {
3689     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3690     if (SrcOp.isReg()) {
3691       AddrRegs.push_back(SrcOp.getReg());
3692       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3693     }
3694   }
3695 
3696   int NumAddrRegs = AddrRegs.size();
3697   if (NumAddrRegs != 1) {
3698     // Round up to 8 elements for v5-v7
3699     // FIXME: Missing intermediate sized register classes and instructions.
3700     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3701       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3702       auto Undef = B.buildUndef(S32);
3703       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3704       NumAddrRegs = RoundedNumRegs;
3705     }
3706 
3707     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3708     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3709   }
3710 
3711   for (int I = 1; I != NumVAddrs; ++I) {
3712     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3713     if (SrcOp.isReg())
3714       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3715   }
3716 }
3717 
3718 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3719 ///
3720 /// Depending on the subtarget, load/store with 16-bit element data need to be
3721 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3722 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3723 /// registers.
3724 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
3731 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3732     MachineInstr &MI, MachineIRBuilder &B,
3733     GISelChangeObserver &Observer,
3734     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3735 
3736   const int NumDefs = MI.getNumExplicitDefs();
3737   bool IsTFE = NumDefs == 2;
3738   // We are only processing the operands of d16 image operations on subtargets
3739   // that use the unpacked register layout, or need to repack the TFE result.
3740 
3741   // TODO: Do we need to guard against already legalized intrinsics?
3742   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3743     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3744 
3745   MachineRegisterInfo *MRI = B.getMRI();
3746   const LLT S32 = LLT::scalar(32);
3747   const LLT S16 = LLT::scalar(16);
3748   const LLT V2S16 = LLT::vector(2, 16);
3749 
3750   // Index of first address argument
3751   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3752 
3753   int NumVAddrs, NumGradients;
3754   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3755   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3756     getDMaskIdx(BaseOpcode, NumDefs);
3757   unsigned DMask = 0;
3758 
  // Check for 16-bit addresses and pack them if so.
3760   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3761   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3762   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3763   const bool IsG16 = GradTy == S16;
3764   const bool IsA16 = AddrTy == S16;
3765 
3766   int DMaskLanes = 0;
3767   if (!BaseOpcode->Atomic) {
3768     DMask = MI.getOperand(DMaskIdx).getImm();
3769     if (BaseOpcode->Gather4) {
3770       DMaskLanes = 4;
3771     } else if (DMask != 0) {
3772       DMaskLanes = countPopulation(DMask);
3773     } else if (!IsTFE && !BaseOpcode->Store) {
3774       // If dmask is 0, this is a no-op load. This can be eliminated.
3775       B.buildUndef(MI.getOperand(0));
3776       MI.eraseFromParent();
3777       return true;
3778     }
3779   }
3780 
3781   Observer.changingInstr(MI);
3782   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3783 
3784   unsigned NewOpcode = NumDefs == 0 ?
3785     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3786 
3787   // Track that we legalized this
3788   MI.setDesc(B.getTII().get(NewOpcode));
3789 
  // We expect to get an error flag since TFC is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
3792   if (IsTFE && DMask == 0) {
3793     DMask = 0x1;
3794     DMaskLanes = 1;
3795     MI.getOperand(DMaskIdx).setImm(DMask);
3796   }
3797 
3798   if (BaseOpcode->Atomic) {
3799     Register VData0 = MI.getOperand(2).getReg();
3800     LLT Ty = MRI->getType(VData0);
3801 
3802     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3803     if (Ty.isVector())
3804       return false;
3805 
3806     if (BaseOpcode->AtomicX2) {
3807       Register VData1 = MI.getOperand(3).getReg();
3808       // The two values are packed in one register.
3809       LLT PackedTy = LLT::vector(2, Ty);
3810       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3811       MI.getOperand(2).setReg(Concat.getReg(0));
3812       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3813     }
3814   }
3815 
3816   int CorrectedNumVAddrs = NumVAddrs;
3817 
3818   // Optimize _L to _LZ when _L is zero
3819   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3820         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3821     const ConstantFP *ConstantLod;
3822     const int LodIdx = AddrIdx + NumVAddrs - 1;
3823 
3824     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3825       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3826         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3827         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3828           LZMappingInfo->LZ, ImageDimIntr->Dim);
3829 
3830         // The starting indexes should remain in the same place.
3831         --NumVAddrs;
3832         --CorrectedNumVAddrs;
3833 
3834         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3835           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3836         MI.RemoveOperand(LodIdx);
3837       }
3838     }
3839   }
3840 
3841   // Optimize _mip away, when 'lod' is zero
3842   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3843     int64_t ConstantLod;
3844     const int LodIdx = AddrIdx + NumVAddrs - 1;
3845 
3846     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3847       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done in the _L to _LZ handling above.
3850         MI.getOperand(LodIdx).ChangeToImmediate(0);
3851         --CorrectedNumVAddrs;
3852       }
3853     }
3854   }
3855 
3856   // Rewrite the addressing register layout before doing anything else.
3857   if (IsA16 || IsG16) {
3858     if (IsA16) {
3859       // Target must support the feature and gradients need to be 16 bit too
3860       if (!ST.hasA16() || !IsG16)
3861         return false;
3862     } else if (!ST.hasG16())
3863       return false;
3864 
3865     if (NumVAddrs > 1) {
3866       SmallVector<Register, 4> PackedRegs;
3867       // Don't compress addresses for G16
3868       const int PackEndIdx =
3869           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3870       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3871                                   PackEndIdx, NumGradients);
3872 
3873       if (!IsA16) {
3874         // Add uncompressed address
3875         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
          Register AddrReg = MI.getOperand(I).getReg();
3877           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3878           PackedRegs.push_back(AddrReg);
3879         }
3880       }
3881 
3882       // See also below in the non-a16 branch
3883       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3884 
3885       if (!UseNSA && PackedRegs.size() > 1) {
3886         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3887         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3888         PackedRegs[0] = Concat.getReg(0);
3889         PackedRegs.resize(1);
3890       }
3891 
3892       const int NumPacked = PackedRegs.size();
3893       for (int I = 0; I != NumVAddrs; ++I) {
3894         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3895         if (!SrcOp.isReg()) {
3896           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3897           continue;
3898         }
3899 
3900         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3901 
3902         if (I < NumPacked)
3903           SrcOp.setReg(PackedRegs[I]);
3904         else
3905           SrcOp.setReg(AMDGPU::NoRegister);
3906       }
3907     }
3908   } else {
3909     // If the register allocator cannot place the address registers contiguously
3910     // without introducing moves, then using the non-sequential address encoding
3911     // is always preferable, since it saves VALU instructions and is usually a
3912     // wash in terms of code size or even better.
3913     //
3914     // However, we currently have no way of hinting to the register allocator
3915     // that MIMG addresses should be placed contiguously when it is possible to
3916     // do so, so force non-NSA for the common 2-address case as a heuristic.
3917     //
3918     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3919     // allocation when possible.
3920     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3921 
3922     if (!UseNSA && NumVAddrs > 1)
3923       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3924   }
3925 
3926   int Flags = 0;
3927   if (IsA16)
3928     Flags |= 1;
3929   if (IsG16)
3930     Flags |= 2;
3931   MI.addOperand(MachineOperand::CreateImm(Flags));
3932 
3933   if (BaseOpcode->Store) { // No TFE for stores?
3934     // TODO: Handle dmask trim
3935     Register VData = MI.getOperand(1).getReg();
3936     LLT Ty = MRI->getType(VData);
3937     if (!Ty.isVector() || Ty.getElementType() != S16)
3938       return true;
3939 
3940     Register RepackedReg = handleD16VData(B, *MRI, VData);
3941     if (RepackedReg != VData) {
3942       MI.getOperand(1).setReg(RepackedReg);
3943     }
3944 
3945     return true;
3946   }
3947 
3948   Register DstReg = MI.getOperand(0).getReg();
3949   LLT Ty = MRI->getType(DstReg);
3950   const LLT EltTy = Ty.getScalarType();
3951   const bool IsD16 = Ty.getScalarType() == S16;
3952   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3953 
3954   // Confirm that the return type is large enough for the dmask specified
3955   if (NumElts < DMaskLanes)
3956     return false;
3957 
3958   if (NumElts > 4 || DMaskLanes > 4)
3959     return false;
3960 
3961   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3962   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3963 
  // The raw, dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3967   LLT RoundedTy;
3968 
  // S32 vector to cover all data, plus one element for the TFE result.
3970   LLT TFETy;
3971 
3972   // Register type to use for each loaded component. Will be S32 or V2S16.
3973   LLT RegTy;
3974 
3975   if (IsD16 && ST.hasUnpackedD16VMem()) {
3976     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3977     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3978     RegTy = S32;
3979   } else {
3980     unsigned EltSize = EltTy.getSizeInBits();
3981     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3982     unsigned RoundedSize = 32 * RoundedElts;
3983     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3984     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3985     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3986   }
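  // For example, a packed <3 x s16> d16 result rounds up to RoundedTy =
  // <4 x s16>, with TFETy = <3 x s32> when TFE is enabled.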
3987 
3988   // The return type does not need adjustment.
3989   // TODO: Should we change s16 case to s32 or <2 x s16>?
3990   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3991     return true;
3992 
3993   Register Dst1Reg;
3994 
3995   // Insert after the instruction.
3996   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3997 
3998   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3999   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4000   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4001   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4002 
4003   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4004 
4005   MI.getOperand(0).setReg(NewResultReg);
4006 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
4011 
4012   if (IsTFE) {
4013     Dst1Reg = MI.getOperand(1).getReg();
4014     if (MRI->getType(Dst1Reg) != S32)
4015       return false;
4016 
4017     // TODO: Make sure the TFE operand bit is set.
4018     MI.RemoveOperand(1);
4019 
4020     // Handle the easy case that requires no repack instructions.
4021     if (Ty == S32) {
4022       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4023       return true;
4024     }
4025   }
4026 
4027   // Now figure out how to copy the new result register back into the old
4028   // result.
4029   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4030 
4031   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4032 
4033   if (ResultNumRegs == 1) {
4034     assert(!IsTFE);
4035     ResultRegs[0] = NewResultReg;
4036   } else {
4037     // We have to repack into a new vector of some kind.
4038     for (int I = 0; I != NumDataRegs; ++I)
4039       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4040     B.buildUnmerge(ResultRegs, NewResultReg);
4041 
4042     // Drop the final TFE element to get the data part. The TFE result is
4043     // directly written to the right place already.
4044     if (IsTFE)
4045       ResultRegs.resize(NumDataRegs);
4046   }
4047 
4048   // For an s16 scalar result, we form an s32 result with a truncate regardless
4049   // of packed vs. unpacked.
4050   if (IsD16 && !Ty.isVector()) {
4051     B.buildTrunc(DstReg, ResultRegs[0]);
4052     return true;
4053   }
4054 
4055   // Avoid a build/concat_vector of 1 entry.
4056   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4057     B.buildBitcast(DstReg, ResultRegs[0]);
4058     return true;
4059   }
4060 
4061   assert(Ty.isVector());
4062 
4063   if (IsD16) {
4064     // For packed D16 results with TFE enabled, all the data components are
4065     // S32. Cast back to the expected type.
4066     //
    // TODO: We don't really need to use s32 elements for the load. We would
    // only need one cast for the TFE result if a multiple of v2s16 was used.
4069     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4070       for (Register &Reg : ResultRegs)
4071         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4072     } else if (ST.hasUnpackedD16VMem()) {
4073       for (Register &Reg : ResultRegs)
4074         Reg = B.buildTrunc(S16, Reg).getReg(0);
4075     }
4076   }
4077 
4078   auto padWithUndef = [&](LLT Ty, int NumElts) {
4079     if (NumElts == 0)
4080       return;
4081     Register Undef = B.buildUndef(Ty).getReg(0);
4082     for (int I = 0; I != NumElts; ++I)
4083       ResultRegs.push_back(Undef);
4084   };
4085 
4086   // Pad out any elements eliminated due to the dmask.
4087   LLT ResTy = MRI->getType(ResultRegs[0]);
4088   if (!ResTy.isVector()) {
4089     padWithUndef(ResTy, NumElts - ResultRegs.size());
4090     B.buildBuildVector(DstReg, ResultRegs);
4091     return true;
4092   }
4093 
4094   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4095   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4096 
4097   // Deal with the one annoying legal case.
4098   const LLT V3S16 = LLT::vector(3, 16);
4099   if (Ty == V3S16) {
4100     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4101     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4102     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4103     return true;
4104   }
4105 
4106   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4107   B.buildConcatVectors(DstReg, ResultRegs);
4108   return true;
4109 }
4110 
4111 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4112   MachineInstr &MI, MachineIRBuilder &B,
4113   GISelChangeObserver &Observer) const {
4114   Register Dst = MI.getOperand(0).getReg();
4115   LLT Ty = B.getMRI()->getType(Dst);
4116   unsigned Size = Ty.getSizeInBits();
4117   MachineFunction &MF = B.getMF();
4118 
4119   Observer.changingInstr(MI);
4120 
4121   // FIXME: We don't really need this intermediate instruction. The intrinsic
4122   // should be fixed to have a memory operand. Since it's readnone, we're not
4123   // allowed to add one.
4124   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4125   MI.RemoveOperand(1); // Remove intrinsic ID
4126 
4127   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4128   // TODO: Should this use datalayout alignment?
4129   const unsigned MemSize = (Size + 7) / 8;
4130   const Align MemAlign(4);
4131   MachineMemOperand *MMO = MF.getMachineMemOperand(
4132       MachinePointerInfo(),
4133       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4134           MachineMemOperand::MOInvariant,
4135       MemSize, MemAlign);
4136   MI.addMemOperand(MF, MMO);
4137 
4138   // There are no 96-bit result scalar loads, but widening to 128-bit should
4139   // always be legal. We may need to restore this to a 96-bit result if it turns
4140   // out this needs to be converted to a vector load during RegBankSelect.
4141   if (!isPowerOf2_32(Size)) {
4142     LegalizerHelper Helper(MF, *this, Observer, B);
4143 
4144     if (Ty.isVector())
4145       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4146     else
4147       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4148   }
4149 
4150   Observer.changedInstr(MI);
4151   return true;
4152 }
4153 
4154 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4155                                                 MachineRegisterInfo &MRI,
4156                                                 MachineIRBuilder &B) const {
  // If this is not the HSA path, or the trap handler is disabled, insert an
  // s_endpgm instruction.
4158   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4159       !ST.isTrapHandlerEnabled()) {
4160     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4161   } else {
4162     // Pass queue pointer to trap handler as input, and insert trap instruction
4163     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4164     const ArgDescriptor *Arg =
4165         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4166     if (!Arg)
4167       return false;
4168     MachineRegisterInfo &MRI = *B.getMRI();
4169     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4170     Register LiveIn = getLiveInRegister(
4171         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4172         /*InsertLiveInCopy=*/false);
4173     if (!loadInputValue(LiveIn, B, Arg))
4174       return false;
4175     B.buildCopy(SGPR01, LiveIn);
4176     B.buildInstr(AMDGPU::S_TRAP)
4177         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4178         .addReg(SGPR01, RegState::Implicit);
4179   }
4180 
4181   MI.eraseFromParent();
4182   return true;
4183 }
4184 
4185 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4186     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is not the HSA path, or the trap handler is disabled, report a
  // warning accordingly.
4189   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4190       !ST.isTrapHandlerEnabled()) {
4191     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4192                                      "debugtrap handler not supported",
4193                                      MI.getDebugLoc(), DS_Warning);
4194     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4195     Ctx.diagnose(NoTrap);
4196   } else {
4197     // Insert debug-trap instruction
4198     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4199   }
4200 
4201   MI.eraseFromParent();
4202   return true;
4203 }
4204 
4205 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4206                                             MachineInstr &MI) const {
4207   MachineIRBuilder &B = Helper.MIRBuilder;
4208   MachineRegisterInfo &MRI = *B.getMRI();
4209 
  // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4211   auto IntrID = MI.getIntrinsicID();
4212   switch (IntrID) {
4213   case Intrinsic::amdgcn_if:
4214   case Intrinsic::amdgcn_else: {
4215     MachineInstr *Br = nullptr;
4216     MachineBasicBlock *UncondBrTarget = nullptr;
4217     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4218       const SIRegisterInfo *TRI
4219         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4220 
4221       Register Def = MI.getOperand(1).getReg();
4222       Register Use = MI.getOperand(3).getReg();
4223 
4224       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4225       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4226       if (IntrID == Intrinsic::amdgcn_if) {
4227         B.buildInstr(AMDGPU::SI_IF)
4228           .addDef(Def)
4229           .addUse(Use)
4230           .addMBB(UncondBrTarget);
4231       } else {
4232         B.buildInstr(AMDGPU::SI_ELSE)
4233           .addDef(Def)
4234           .addUse(Use)
4235           .addMBB(UncondBrTarget)
4236           .addImm(0);
4237       }
4238 
4239       if (Br) {
4240         Br->getOperand(0).setMBB(CondBrTarget);
4241       } else {
4242         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4243         // since we're swapping branch targets it needs to be reinserted.
4244         // FIXME: IRTranslator should probably not do this
4245         B.buildBr(*CondBrTarget);
4246       }
4247 
4248       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4249       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4250       MI.eraseFromParent();
4251       BrCond->eraseFromParent();
4252       return true;
4253     }
4254 
4255     return false;
4256   }
4257   case Intrinsic::amdgcn_loop: {
4258     MachineInstr *Br = nullptr;
4259     MachineBasicBlock *UncondBrTarget = nullptr;
4260     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4261       const SIRegisterInfo *TRI
4262         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4263 
4264       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4265       Register Reg = MI.getOperand(2).getReg();
4266 
4267       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4268       B.buildInstr(AMDGPU::SI_LOOP)
4269         .addUse(Reg)
4270         .addMBB(UncondBrTarget);
4271 
4272       if (Br)
4273         Br->getOperand(0).setMBB(CondBrTarget);
4274       else
4275         B.buildBr(*CondBrTarget);
4276 
4277       MI.eraseFromParent();
4278       BrCond->eraseFromParent();
4279       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4280       return true;
4281     }
4282 
4283     return false;
4284   }
4285   case Intrinsic::amdgcn_kernarg_segment_ptr:
4286     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4287       // This only makes sense to call in a kernel, so just lower to null.
4288       B.buildConstant(MI.getOperand(0).getReg(), 0);
4289       MI.eraseFromParent();
4290       return true;
4291     }
4292 
4293     return legalizePreloadedArgIntrin(
4294       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
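  // The wavefront size is a fixed subtarget property, so fold the intrinsic
  // to a constant immediately.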
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
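  // Buffer load/store intrinsics share common lowerings; for the raw/struct
  // variants the trailing flags distinguish the plain, format, and typed
  // (tbuffer) forms.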
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
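  // All buffer atomics share a single lowering; legalizeBufferAtomic recovers
  // the specific operation from IntrID.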
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
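    // Image (dimension-aware) intrinsics are table-driven via
    // ImageDimIntrinsicInfo; anything else needs no further work here.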
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}