1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use "
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of vector elements up to the next power of two.
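// e.g. <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.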
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the scalar size in bits up to the next power of two.
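// e.g. s48 becomes s64 and s20 becomes s32.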
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
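// Identify vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.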
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
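// Identify vectors of 16-bit elements with more than two elements,
// e.g. <4 x s16> or <6 x s16>.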
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
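// Pad the vector with one additional element, e.g. <3 x s16> -> <4 x s16>.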
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
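// Split wide vectors into pieces of at most 64 bits each,
// e.g. <4 x s32> -> <2 x s32> and <8 x s16> -> <4 x s16>.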
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
100 // Increase the number of vector elements until the total size reaches the
101 // next multiple of 32 bits.
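// e.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits) and <3 x s16> (48 bits)
// becomes <4 x s16> (64 bits).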
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
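      // <8 x s8> -> <2 x s32>
      // <6 x s16> -> <3 x s32>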
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
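// Element types that pack evenly into 32-bit registers: 16-bit elements only
// in pairs, otherwise 32, 64, 128 or 256-bit elements.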
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
182 // Any combination of 32 or 64-bit elements up to the maximum register size, and
183 // multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < RegSize)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
299 // work around this. Eventually it should ignore the type for loads and only
300 // care about the size. Return true in cases where we will work around this
301 // for now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   : ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .legalIf(isPointer(0))
419     .clampScalar(0, S32, S256)
420     .widenScalarToNextPow2(0, 32)
421     .clampMaxNumElements(0, S32, 16)
422     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
423     .scalarize(0);
424 
425   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
426     // Full set of gfx9 features.
427     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
428       .legalFor({S32, S16, V2S16})
429       .clampScalar(0, S16, S32)
430       .clampMaxNumElements(0, S16, 2)
431       .scalarize(0)
432       .widenScalarToNextPow2(0, 32);
433 
434     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
435       .legalFor({S32, S16, V2S16}) // Clamp modifier
436       .minScalar(0, S16)
437       .clampMaxNumElements(0, S16, 2)
438       .scalarize(0)
439       .widenScalarToNextPow2(0, 32)
440       .lower();
441   } else if (ST.has16BitInsts()) {
442     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
443       .legalFor({S32, S16})
444       .clampScalar(0, S16, S32)
445       .scalarize(0)
446       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
447 
448     // Technically the saturating operations require clamp bit support, but this
449     // was introduced at the same time as 16-bit operations.
450     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
451       .legalFor({S32, S16}) // Clamp modifier
452       .minScalar(0, S16)
453       .scalarize(0)
454       .widenScalarToNextPow2(0, 16)
455       .lower();
456 
457     // We're just lowering this, but it helps get a better result to try to
458     // coerce to the desired type first.
459     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
460       .minScalar(0, S16)
461       .scalarize(0)
462       .lower();
463   } else {
464     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
465       .legalFor({S32})
466       .clampScalar(0, S32, S32)
467       .scalarize(0);
468 
469     if (ST.hasIntClamp()) {
470       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
471         .legalFor({S32}) // Clamp modifier.
472         .scalarize(0)
473         .minScalarOrElt(0, S32)
474         .lower();
475     } else {
476       // Clamp bit support was added in VI, along with 16-bit operations.
477       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
478         .minScalar(0, S32)
479         .scalarize(0)
480         .lower();
481     }
482 
483     // FIXME: DAG expansion gets better results. The widening uses the smaller
484     // range values and goes for the min/max lowering directly.
485     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
486       .minScalar(0, S32)
487       .scalarize(0)
488       .lower();
489   }
490 
491   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
492     .customFor({S32, S64})
493     .clampScalar(0, S32, S64)
494     .widenScalarToNextPow2(0, 32)
495     .scalarize(0);
496 
497   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
498     .legalFor({S32})
499     .clampScalar(0, S32, S32)
500     .scalarize(0);
501 
502   // Report legal for any types we can handle anywhere. For the cases only legal
503   // on the SALU, RegBankSelect will be able to re-legalize.
504   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
505     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
506     .clampScalar(0, S32, S64)
507     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
508     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
509     .widenScalarToNextPow2(0)
510     .scalarize(0);
511 
512   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
513                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
514     .legalFor({{S32, S1}, {S32, S32}})
515     .minScalar(0, S32)
516     // TODO: .scalarize(0)
517     .lower();
518 
519   getActionDefinitionsBuilder(G_BITCAST)
520     // Don't worry about the size constraint.
521     .legalIf(all(isRegisterType(0), isRegisterType(1)))
522     .lower();
523 
524 
525   getActionDefinitionsBuilder(G_CONSTANT)
526     .legalFor({S1, S32, S64, S16, GlobalPtr,
527                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
528     .legalIf(isPointer(0))
529     .clampScalar(0, S32, S64)
530     .widenScalarToNextPow2(0);
531 
532   getActionDefinitionsBuilder(G_FCONSTANT)
533     .legalFor({S32, S64, S16})
534     .clampScalar(0, S16, S64);
535 
536   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
537       .legalIf(isRegisterType(0))
538       // s1 and s16 are special cases because they have legal operations on
539       // them, but don't really occupy registers in the normal way.
540       .legalFor({S1, S16})
541       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
542       .clampScalarOrElt(0, S32, MaxScalar)
543       .widenScalarToNextPow2(0, 32)
544       .clampMaxNumElements(0, S32, 16);
545 
546   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
547 
548   // If the amount is divergent, we have to do a wave reduction to get the
549   // maximum value, so this is expanded during RegBankSelect.
550   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
551     .legalFor({{PrivatePtr, S32}});
552 
553   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
554     .unsupportedFor({PrivatePtr})
555     .custom();
556   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
557 
558   auto &FPOpActions = getActionDefinitionsBuilder(
559     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
560     .legalFor({S32, S64});
561   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
562     .customFor({S32, S64});
563   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
564     .customFor({S32, S64});
565 
566   if (ST.has16BitInsts()) {
567     if (ST.hasVOP3PInsts())
568       FPOpActions.legalFor({S16, V2S16});
569     else
570       FPOpActions.legalFor({S16});
571 
572     TrigActions.customFor({S16});
573     FDIVActions.customFor({S16});
574   }
575 
576   auto &MinNumMaxNum = getActionDefinitionsBuilder({
577       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
578 
579   if (ST.hasVOP3PInsts()) {
580     MinNumMaxNum.customFor(FPTypesPK16)
581       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
582       .clampMaxNumElements(0, S16, 2)
583       .clampScalar(0, S16, S64)
584       .scalarize(0);
585   } else if (ST.has16BitInsts()) {
586     MinNumMaxNum.customFor(FPTypes16)
587       .clampScalar(0, S16, S64)
588       .scalarize(0);
589   } else {
590     MinNumMaxNum.customFor(FPTypesBase)
591       .clampScalar(0, S32, S64)
592       .scalarize(0);
593   }
594 
595   if (ST.hasVOP3PInsts())
596     FPOpActions.clampMaxNumElements(0, S16, 2);
597 
598   FPOpActions
599     .scalarize(0)
600     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
601 
602   TrigActions
603     .scalarize(0)
604     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
605 
606   FDIVActions
607     .scalarize(0)
608     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
609 
610   getActionDefinitionsBuilder({G_FNEG, G_FABS})
611     .legalFor(FPTypesPK16)
612     .clampMaxNumElements(0, S16, 2)
613     .scalarize(0)
614     .clampScalar(0, S16, S64);
615 
616   if (ST.has16BitInsts()) {
617     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
618       .legalFor({S32, S64, S16})
619       .scalarize(0)
620       .clampScalar(0, S16, S64);
621   } else {
622     getActionDefinitionsBuilder(G_FSQRT)
623       .legalFor({S32, S64})
624       .scalarize(0)
625       .clampScalar(0, S32, S64);
626 
627     if (ST.hasFractBug()) {
628       getActionDefinitionsBuilder(G_FFLOOR)
629         .customFor({S64})
630         .legalFor({S32, S64})
631         .scalarize(0)
632         .clampScalar(0, S32, S64);
633     } else {
634       getActionDefinitionsBuilder(G_FFLOOR)
635         .legalFor({S32, S64})
636         .scalarize(0)
637         .clampScalar(0, S32, S64);
638     }
639   }
640 
641   getActionDefinitionsBuilder(G_FPTRUNC)
642     .legalFor({{S32, S64}, {S16, S32}})
643     .scalarize(0)
644     .lower();
645 
646   getActionDefinitionsBuilder(G_FPEXT)
647     .legalFor({{S64, S32}, {S32, S16}})
648     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
649     .scalarize(0);
650 
651   getActionDefinitionsBuilder(G_FSUB)
652       // Use actual fsub instruction
653       .legalFor({S32})
654       // Must use fadd + fneg
655       .lowerFor({S64, S16, V2S16})
656       .scalarize(0)
657       .clampScalar(0, S32, S64);
658 
659   // Whether this is legal depends on the floating point mode for the function.
660   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
661   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
662     FMad.customFor({S32, S16});
663   else if (ST.hasMadMacF32Insts())
664     FMad.customFor({S32});
665   else if (ST.hasMadF16())
666     FMad.customFor({S16});
667   FMad.scalarize(0)
668       .lower();
669 
670   // TODO: Do we need to clamp maximum bitwidth?
671   getActionDefinitionsBuilder(G_TRUNC)
672     .legalIf(isScalar(0))
673     .legalFor({{V2S16, V2S32}})
674     .clampMaxNumElements(0, S16, 2)
675     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
676     // situations (like an invalid implicit use), we don't want to loop forever
677     // in the legalizer.
678     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
679     .alwaysLegal();
680 
681   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
682     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
683                {S32, S1}, {S64, S1}, {S16, S1}})
684     .scalarize(0)
685     .clampScalar(0, S32, S64)
686     .widenScalarToNextPow2(1, 32);
687 
688   // TODO: Split s1->s64 during regbankselect for VALU.
689   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
690     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
691     .lowerFor({{S32, S64}})
692     .lowerIf(typeIs(1, S1))
693     .customFor({{S64, S64}});
694   if (ST.has16BitInsts())
695     IToFP.legalFor({{S16, S16}});
696   IToFP.clampScalar(1, S32, S64)
697        .minScalar(0, S32)
698        .scalarize(0)
699        .widenScalarToNextPow2(1);
700 
701   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
702     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
703     .customFor({{S64, S64}})
704     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
705   if (ST.has16BitInsts())
706     FPToI.legalFor({{S16, S16}});
707   else
708     FPToI.minScalar(1, S32);
709 
710   FPToI.minScalar(0, S32)
711        .scalarize(0)
712        .lower();
713 
714   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
715     .scalarize(0)
716     .lower();
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
720       .legalFor({S16, S32, S64})
721       .clampScalar(0, S16, S64)
722       .scalarize(0);
723   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
724     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
725       .legalFor({S32, S64})
726       .clampScalar(0, S32, S64)
727       .scalarize(0);
728   } else {
729     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
730       .legalFor({S32})
731       .customFor({S64})
732       .clampScalar(0, S32, S64)
733       .scalarize(0);
734   }
735 
736   getActionDefinitionsBuilder(G_PTR_ADD)
737     .legalIf(all(isPointer(0), sameSize(0, 1)))
738     .scalarize(0)
739     .scalarSameSizeAs(1, 0);
740 
741   getActionDefinitionsBuilder(G_PTRMASK)
742     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
743     .scalarSameSizeAs(1, 0)
744     .scalarize(0);
745 
746   auto &CmpBuilder =
747     getActionDefinitionsBuilder(G_ICMP)
748     // The compare output type differs based on the register bank of the output,
749     // so make both s1 and s32 legal.
750     //
751     // Scalar compares producing output in scc will be promoted to s32, as that
752     // is the allocatable register type that will be needed for the copy from
753     // scc. This will be promoted during RegBankSelect, and we assume something
754     // before that won't try to use s32 result types.
755     //
756     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
757     // bank.
758     .legalForCartesianProduct(
759       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
760     .legalForCartesianProduct(
761       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
762   if (ST.has16BitInsts()) {
763     CmpBuilder.legalFor({{S1, S16}});
764   }
765 
766   CmpBuilder
767     .widenScalarToNextPow2(1)
768     .clampScalar(1, S32, S64)
769     .scalarize(0)
770     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
771 
772   getActionDefinitionsBuilder(G_FCMP)
773     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
774     .widenScalarToNextPow2(1)
775     .clampScalar(1, S32, S64)
776     .scalarize(0);
777 
778   // FIXME: fpow has a selection pattern that should move to custom lowering.
779   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
780   if (ST.has16BitInsts())
781     Exp2Ops.legalFor({S32, S16});
782   else
783     Exp2Ops.legalFor({S32});
784   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
785   Exp2Ops.scalarize(0);
786 
787   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
788   if (ST.has16BitInsts())
789     ExpOps.customFor({{S32}, {S16}});
790   else
791     ExpOps.customFor({S32});
792   ExpOps.clampScalar(0, MinScalarFPTy, S32)
793         .scalarize(0);
794 
795   getActionDefinitionsBuilder(G_FPOWI)
796     .clampScalar(0, MinScalarFPTy, S32)
797     .lower();
798 
799   // The 64-bit versions produce 32-bit results, but only on the SALU.
800   getActionDefinitionsBuilder(G_CTPOP)
801     .legalFor({{S32, S32}, {S32, S64}})
802     .clampScalar(0, S32, S32)
803     .clampScalar(1, S32, S64)
804     .scalarize(0)
805     .widenScalarToNextPow2(0, 32)
806     .widenScalarToNextPow2(1, 32);
807 
808   // The hardware instructions return a different result on 0 than the generic
809   // instructions expect. The hardware produces -1, but these produce the
810   // bitwidth.
811   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
812     .scalarize(0)
813     .clampScalar(0, S32, S32)
814     .clampScalar(1, S32, S64)
815     .widenScalarToNextPow2(0, 32)
816     .widenScalarToNextPow2(1, 32)
817     .lower();
818 
819   // The 64-bit versions produce 32-bit results, but only on the SALU.
820   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
821     .legalFor({{S32, S32}, {S32, S64}})
822     .clampScalar(0, S32, S32)
823     .clampScalar(1, S32, S64)
824     .scalarize(0)
825     .widenScalarToNextPow2(0, 32)
826     .widenScalarToNextPow2(1, 32);
827 
828   getActionDefinitionsBuilder(G_BITREVERSE)
829     .legalFor({S32})
830     .clampScalar(0, S32, S32)
831     .scalarize(0);
832 
833   if (ST.has16BitInsts()) {
834     getActionDefinitionsBuilder(G_BSWAP)
835       .legalFor({S16, S32, V2S16})
836       .clampMaxNumElements(0, S16, 2)
837       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
838       // narrowScalar limitation.
839       .widenScalarToNextPow2(0)
840       .clampScalar(0, S16, S32)
841       .scalarize(0);
842 
843     if (ST.hasVOP3PInsts()) {
844       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
845         .legalFor({S32, S16, V2S16})
846         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
847         .clampMaxNumElements(0, S16, 2)
848         .minScalar(0, S16)
849         .widenScalarToNextPow2(0)
850         .scalarize(0)
851         .lower();
852     } else {
853       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
854         .legalFor({S32, S16})
855         .widenScalarToNextPow2(0)
856         .minScalar(0, S16)
857         .scalarize(0)
858         .lower();
859     }
860   } else {
861     // TODO: Should have same legality without v_perm_b32
862     getActionDefinitionsBuilder(G_BSWAP)
863       .legalFor({S32})
864       .lowerIf(scalarNarrowerThan(0, 32))
865       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
866       // narrowScalar limitation.
867       .widenScalarToNextPow2(0)
868       .maxScalar(0, S32)
869       .scalarize(0)
870       .lower();
871 
872     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
873       .legalFor({S32})
874       .minScalar(0, S32)
875       .widenScalarToNextPow2(0)
876       .scalarize(0)
877       .lower();
878   }
879 
880   getActionDefinitionsBuilder(G_INTTOPTR)
881     // List the common cases
882     .legalForCartesianProduct(AddrSpaces64, {S64})
883     .legalForCartesianProduct(AddrSpaces32, {S32})
884     .scalarize(0)
885     // Accept any address space as long as the size matches
886     .legalIf(sameSize(0, 1))
887     .widenScalarIf(smallerThan(1, 0),
888       [](const LegalityQuery &Query) {
889         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
890       })
891     .narrowScalarIf(largerThan(1, 0),
892       [](const LegalityQuery &Query) {
893         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
894       });
895 
896   getActionDefinitionsBuilder(G_PTRTOINT)
897     // List the common cases
898     .legalForCartesianProduct(AddrSpaces64, {S64})
899     .legalForCartesianProduct(AddrSpaces32, {S32})
900     .scalarize(0)
901     // Accept any address space as long as the size matches
902     .legalIf(sameSize(0, 1))
903     .widenScalarIf(smallerThan(0, 1),
904       [](const LegalityQuery &Query) {
905         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
906       })
907     .narrowScalarIf(
908       largerThan(0, 1),
909       [](const LegalityQuery &Query) {
910         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
911       });
912 
913   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
914     .scalarize(0)
915     .custom();
916 
917   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
918                                     bool IsLoad) -> bool {
919     const LLT DstTy = Query.Types[0];
920 
921     // Split vector extloads.
922     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
923     unsigned Align = Query.MMODescrs[0].AlignInBits;
924 
925     if (MemSize < DstTy.getSizeInBits())
926       MemSize = std::max(MemSize, Align);
927 
928     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
929       return true;
930 
931     const LLT PtrTy = Query.Types[1];
932     unsigned AS = PtrTy.getAddressSpace();
933     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
934       return true;
935 
936     // Catch weird sized loads that don't evenly divide into the access sizes
937     // TODO: May be able to widen depending on alignment etc.
938     unsigned NumRegs = (MemSize + 31) / 32;
939     if (NumRegs == 3) {
940       if (!ST.hasDwordx3LoadStores())
941         return true;
942     } else {
943       // If the alignment allows, these should have been widened.
944       if (!isPowerOf2_32(NumRegs))
945         return true;
946     }
947 
948     if (Align < MemSize) {
949       const SITargetLowering *TLI = ST.getTargetLowering();
950       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
951     }
952 
953     return false;
954   };
955 
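  // Decide whether an odd-sized load result (e.g. s96 where dwordx3 loads are
  // unavailable) should be widened to the next power-of-2 size. Only do so
  // when the known alignment already covers the rounded size.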
956   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
957                                          unsigned Opc) -> bool {
958     unsigned Size = Query.Types[0].getSizeInBits();
959     if (isPowerOf2_32(Size))
960       return false;
961 
962     if (Size == 96 && ST.hasDwordx3LoadStores())
963       return false;
964 
965     unsigned AddrSpace = Query.Types[1].getAddressSpace();
966     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
967       return false;
968 
969     unsigned Align = Query.MMODescrs[0].AlignInBits;
970     unsigned RoundedSize = NextPowerOf2(Size);
971     return (Align >= RoundedSize);
972   };
973 
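  // An alignment value of 0 in the memory descriptors below means no minimum
  // alignment is required.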
974   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
975   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
976   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
977 
978   // TODO: Refine based on subtargets which support unaligned access or 128-bit
979   // LDS
980   // TODO: Unsupported flat for SI.
981 
982   for (unsigned Op : {G_LOAD, G_STORE}) {
983     const bool IsStore = Op == G_STORE;
984 
985     auto &Actions = getActionDefinitionsBuilder(Op);
986     // Explicitly list some common cases.
987     // TODO: Does this help compile time at all?
988     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
989                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
990                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
991                                       {S64, GlobalPtr, 64, GlobalAlign32},
992                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
993                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
994                                       {S32, GlobalPtr, 8, GlobalAlign8},
995                                       {S32, GlobalPtr, 16, GlobalAlign16},
996 
997                                       {S32, LocalPtr, 32, 32},
998                                       {S64, LocalPtr, 64, 32},
999                                       {V2S32, LocalPtr, 64, 32},
1000                                       {S32, LocalPtr, 8, 8},
1001                                       {S32, LocalPtr, 16, 16},
1002                                       {V2S16, LocalPtr, 32, 32},
1003 
1004                                       {S32, PrivatePtr, 32, 32},
1005                                       {S32, PrivatePtr, 8, 8},
1006                                       {S32, PrivatePtr, 16, 16},
1007                                       {V2S16, PrivatePtr, 32, 32},
1008 
1009                                       {S32, ConstantPtr, 32, GlobalAlign32},
1010                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1011                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1012                                       {S64, ConstantPtr, 64, GlobalAlign32},
1013                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1014     Actions.legalIf(
1015       [=](const LegalityQuery &Query) -> bool {
1016         return isLoadStoreLegal(ST, Query, Op);
1017       });
1018 
1019     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1020     // 64-bits.
1021     //
1022     // TODO: Should generalize bitcast action into coerce, which will also cover
1023     // inserting addrspacecasts.
1024     Actions.customIf(typeIs(1, Constant32Ptr));
1025 
1026     // Turn any illegal element vectors into something easier to deal
1027     // with. These will ultimately produce 32-bit scalar shifts to extract the
1028     // parts anyway.
1029     //
1030     // For odd 16-bit element vectors, prefer to split those into pieces with
1031     // 16-bit vector parts.
1032     Actions.bitcastIf(
1033       [=](const LegalityQuery &Query) -> bool {
1034         const LLT Ty = Query.Types[0];
1035         const unsigned Size = Ty.getSizeInBits();
1036 
1037         if (Size != Query.MMODescrs[0].SizeInBits)
1038           return Size <= 32 && Ty.isVector();
1039 
1040         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
1041           return true;
1042         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
1043                !isRegisterVectorElementType(Ty.getElementType());
1044       }, bitcastToRegisterType(0));
1045 
1046     Actions
1047         .customIf(typeIs(1, Constant32Ptr))
1048         // Widen suitably aligned loads by loading extra elements.
1049         .moreElementsIf([=](const LegalityQuery &Query) {
1050             const LLT Ty = Query.Types[0];
1051             return Op == G_LOAD && Ty.isVector() &&
1052                    shouldWidenLoadResult(Query, Op);
1053           }, moreElementsToNextPow2(0))
1054         .widenScalarIf([=](const LegalityQuery &Query) {
1055             const LLT Ty = Query.Types[0];
1056             return Op == G_LOAD && !Ty.isVector() &&
1057                    shouldWidenLoadResult(Query, Op);
1058           }, widenScalarOrEltToNextPow2(0))
1059         .narrowScalarIf(
1060             [=](const LegalityQuery &Query) -> bool {
1061               return !Query.Types[0].isVector() &&
1062                      needToSplitMemOp(Query, Op == G_LOAD);
1063             },
1064             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1065               const LLT DstTy = Query.Types[0];
1066               const LLT PtrTy = Query.Types[1];
1067 
1068               const unsigned DstSize = DstTy.getSizeInBits();
1069               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1070 
1071               // Split extloads.
1072               if (DstSize > MemSize)
1073                 return std::make_pair(0, LLT::scalar(MemSize));
1074 
1075               if (!isPowerOf2_32(DstSize)) {
1076                 // We're probably decomposing an odd sized store. Try to split
1077                 // to the widest type. TODO: Account for alignment. As-is it
1078                 // should be OK, since the new parts will be further legalized.
1079                 unsigned FloorSize = PowerOf2Floor(DstSize);
1080                 return std::make_pair(0, LLT::scalar(FloorSize));
1081               }
1082 
1083               if (DstSize > 32 && (DstSize % 32 != 0)) {
1084                 // FIXME: Need a way to specify non-extload of larger size if
1085                 // suitably aligned.
1086                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1087               }
1088 
1089               unsigned MaxSize = maxSizeForAddrSpace(ST,
1090                                                      PtrTy.getAddressSpace(),
1091                                                      Op == G_LOAD);
1092               if (MemSize > MaxSize)
1093                 return std::make_pair(0, LLT::scalar(MaxSize));
1094 
1095               unsigned Align = Query.MMODescrs[0].AlignInBits;
1096               return std::make_pair(0, LLT::scalar(Align));
1097             })
1098         .fewerElementsIf(
1099             [=](const LegalityQuery &Query) -> bool {
1100               return Query.Types[0].isVector() &&
1101                      needToSplitMemOp(Query, Op == G_LOAD);
1102             },
1103             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1104               const LLT DstTy = Query.Types[0];
1105               const LLT PtrTy = Query.Types[1];
1106 
1107               LLT EltTy = DstTy.getElementType();
1108               unsigned MaxSize = maxSizeForAddrSpace(ST,
1109                                                      PtrTy.getAddressSpace(),
1110                                                      Op == G_LOAD);
1111 
1112               // FIXME: Handle widened to power of 2 results better. This ends
1113               // up scalarizing.
1114               // FIXME: 3 element stores scalarized on SI
1115 
1116               // Split if it's too large for the address space.
1117               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1118                 unsigned NumElts = DstTy.getNumElements();
1119                 unsigned EltSize = EltTy.getSizeInBits();
1120 
1121                 if (MaxSize % EltSize == 0) {
1122                   return std::make_pair(
1123                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1124                 }
1125 
1126                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1127 
1128                 // FIXME: Refine when odd breakdowns are handled
1129                 // The scalars will need to be re-legalized.
1130                 if (NumPieces == 1 || NumPieces >= NumElts ||
1131                     NumElts % NumPieces != 0)
1132                   return std::make_pair(0, EltTy);
1133 
1134                 return std::make_pair(0,
1135                                       LLT::vector(NumElts / NumPieces, EltTy));
1136               }
1137 
1138               // FIXME: We could probably handle weird extending loads better.
1139               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1140               if (DstTy.getSizeInBits() > MemSize)
1141                 return std::make_pair(0, EltTy);
1142 
1143               unsigned EltSize = EltTy.getSizeInBits();
1144               unsigned DstSize = DstTy.getSizeInBits();
1145               if (!isPowerOf2_32(DstSize)) {
1146                 // We're probably decomposing an odd sized store. Try to split
1147                 // to the widest type. TODO: Account for alignment. As-is it
1148                 // should be OK, since the new parts will be further legalized.
1149                 unsigned FloorSize = PowerOf2Floor(DstSize);
1150                 return std::make_pair(
1151                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1152               }
1153 
1154               // Need to split because of alignment.
1155               unsigned Align = Query.MMODescrs[0].AlignInBits;
1156               if (EltSize > Align &&
1157                   (EltSize / Align < DstTy.getNumElements())) {
1158                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1159               }
1160 
1161               // May need relegalization for the scalars.
1162               return std::make_pair(0, EltTy);
1163             })
1164         .minScalar(0, S32);
1165 
1166     if (IsStore)
1167       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1168 
1169     // TODO: Need a bitcast lower option?
1170     Actions
1171         .widenScalarToNextPow2(0)
1172         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1173   }
1174 
1175   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1176                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1177                                                   {S32, GlobalPtr, 16, 2 * 8},
1178                                                   {S32, LocalPtr, 8, 8},
1179                                                   {S32, LocalPtr, 16, 16},
1180                                                   {S32, PrivatePtr, 8, 8},
1181                                                   {S32, PrivatePtr, 16, 16},
1182                                                   {S32, ConstantPtr, 8, 8},
1183                                                   {S32, ConstantPtr, 16, 2 * 8}});
1184   if (ST.hasFlatAddressSpace()) {
1185     ExtLoads.legalForTypesWithMemDesc(
1186         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1187   }
1188 
1189   ExtLoads.clampScalar(0, S32, S32)
1190           .widenScalarToNextPow2(0)
1191           .unsupportedIfMemSizeNotPow2()
1192           .lower();
1193 
1194   auto &Atomics = getActionDefinitionsBuilder(
1195     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1196      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1197      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1198      G_ATOMICRMW_UMIN})
1199     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1200                {S64, GlobalPtr}, {S64, LocalPtr},
1201                {S32, RegionPtr}, {S64, RegionPtr}});
1202   if (ST.hasFlatAddressSpace()) {
1203     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1204   }
1205 
1206   if (ST.hasLDSFPAtomics()) {
1207     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1208       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1209   }
1210 
1211   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1212   // demarshalling.
1213   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1214     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1215                 {S32, FlatPtr}, {S64, FlatPtr}})
1216     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1217                {S32, RegionPtr}, {S64, RegionPtr}});
1218   // TODO: Pointer types, any 32-bit or 64-bit vector
1219 
1220   // Condition should be s32 for scalar, s1 for vector.
1221   getActionDefinitionsBuilder(G_SELECT)
1222     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1223           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1224           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1225     .clampScalar(0, S16, S64)
1226     .scalarize(1)
1227     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1228     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1229     .clampMaxNumElements(0, S32, 2)
1230     .clampMaxNumElements(0, LocalPtr, 2)
1231     .clampMaxNumElements(0, PrivatePtr, 2)
1232     .scalarize(0)
1233     .widenScalarToNextPow2(0)
1234     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1235 
1236   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1237   // be more flexible with the shift amount type.
1238   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1239     .legalFor({{S32, S32}, {S64, S32}});
1240   if (ST.has16BitInsts()) {
1241     if (ST.hasVOP3PInsts()) {
1242       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1243             .clampMaxNumElements(0, S16, 2);
1244     } else
1245       Shifts.legalFor({{S16, S16}});
1246 
1247     // TODO: Support 16-bit shift amounts for all types
1248     Shifts.widenScalarIf(
1249       [=](const LegalityQuery &Query) {
1250         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1251         // 32-bit amount.
1252         const LLT ValTy = Query.Types[0];
1253         const LLT AmountTy = Query.Types[1];
1254         return ValTy.getSizeInBits() <= 16 &&
1255                AmountTy.getSizeInBits() < 16;
1256       }, changeTo(1, S16));
1257     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1258     Shifts.clampScalar(1, S32, S32);
1259     Shifts.clampScalar(0, S16, S64);
1260     Shifts.widenScalarToNextPow2(0, 16);
1261   } else {
1262     // Make sure we legalize the shift amount type first, as the general
1263     // expansion for the shifted type will produce much worse code if it hasn't
1264     // been truncated already.
1265     Shifts.clampScalar(1, S32, S32);
1266     Shifts.clampScalar(0, S32, S64);
1267     Shifts.widenScalarToNextPow2(0, 32);
1268   }
1269   Shifts.scalarize(0);
1270 
1271   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
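    // Type index layout: G_EXTRACT_VECTOR_ELT is (element, vector, index);
    // G_INSERT_VECTOR_ELT is (vector, element, index).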
1272     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1273     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1274     unsigned IdxTypeIdx = 2;
1275 
1276     getActionDefinitionsBuilder(Op)
1277       .customIf([=](const LegalityQuery &Query) {
1278           const LLT EltTy = Query.Types[EltTypeIdx];
1279           const LLT VecTy = Query.Types[VecTypeIdx];
1280           const LLT IdxTy = Query.Types[IdxTypeIdx];
1281           return (EltTy.getSizeInBits() == 16 ||
1282                   EltTy.getSizeInBits() % 32 == 0) &&
1283                  VecTy.getSizeInBits() % 32 == 0 &&
1284                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1285                  IdxTy.getSizeInBits() == 32;
1286         })
1287       .clampScalar(EltTypeIdx, S32, S64)
1288       .clampScalar(VecTypeIdx, S32, S64)
1289       .clampScalar(IdxTypeIdx, S32, S32);
1290   }
1291 
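  // The extract result type must match the vector element type.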
1292   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1293     .unsupportedIf([=](const LegalityQuery &Query) {
1294         const LLT &EltTy = Query.Types[1].getElementType();
1295         return Query.Types[0] != EltTy;
1296       });
1297 
1298   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1299     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1300     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1301 
1302     // FIXME: Doesn't handle extract of illegal sizes.
1303     getActionDefinitionsBuilder(Op)
1304       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1305       // FIXME: Multiples of 16 should not be legal.
1306       .legalIf([=](const LegalityQuery &Query) {
1307           const LLT BigTy = Query.Types[BigTyIdx];
1308           const LLT LitTy = Query.Types[LitTyIdx];
1309           return (BigTy.getSizeInBits() % 32 == 0) &&
1310                  (LitTy.getSizeInBits() % 16 == 0);
1311         })
1312       .widenScalarIf(
1313         [=](const LegalityQuery &Query) {
1314           const LLT BigTy = Query.Types[BigTyIdx];
1315           return (BigTy.getScalarSizeInBits() < 16);
1316         },
1317         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1318       .widenScalarIf(
1319         [=](const LegalityQuery &Query) {
1320           const LLT LitTy = Query.Types[LitTyIdx];
1321           return (LitTy.getScalarSizeInBits() < 16);
1322         },
1323         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1324       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1325       .widenScalarToNextPow2(BigTyIdx, 32);
1326 
1327   }
1328 
1329   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1330     .legalForCartesianProduct(AllS32Vectors, {S32})
1331     .legalForCartesianProduct(AllS64Vectors, {S64})
1332     .clampNumElements(0, V16S32, V32S32)
1333     .clampNumElements(0, V2S64, V16S64)
1334     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1335 
1336   if (ST.hasScalarPackInsts()) {
1337     BuildVector
1338       // FIXME: Should probably widen s1 vectors straight to s32
1339       .minScalarOrElt(0, S16)
1340       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1341       .minScalar(1, S32);
1342 
1343     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1344       .legalFor({V2S16, S32})
1345       .lower();
1346     BuildVector.minScalarOrElt(0, S32);
1347   } else {
1348     BuildVector.customFor({V2S16, S16});
1349     BuildVector.minScalarOrElt(0, S32);
1350 
1351     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1352       .customFor({V2S16, S32})
1353       .lower();
1354   }
1355 
1356   BuildVector.legalIf(isRegisterType(0));
1357 
1358   // FIXME: Clamp maximum size
1359   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1360     .legalIf(isRegisterType(0));
1361 
1362   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1363   // pre-legalize.
1364   if (ST.hasVOP3PInsts()) {
1365     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1366       .customFor({V2S16, V2S16})
1367       .lower();
1368   } else
1369     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1370 
1371   // Merge/Unmerge
1372   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
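    // For G_MERGE_VALUES the big (merged) type is the destination; for
    // G_UNMERGE_VALUES it is the source.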
1373     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1374     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1375 
1376     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1377       const LLT Ty = Query.Types[TypeIdx];
1378       if (Ty.isVector()) {
1379         const LLT &EltTy = Ty.getElementType();
1380         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1381           return true;
1382         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1383           return true;
1384       }
1385       return false;
1386     };
1387 
1388     auto &Builder = getActionDefinitionsBuilder(Op)
1389       .lowerFor({{S16, V2S16}})
1390       .lowerIf([=](const LegalityQuery &Query) {
1391           const LLT BigTy = Query.Types[BigTyIdx];
1392           return BigTy.getSizeInBits() == 32;
1393         })
1394       // Try to widen to s16 first for small types.
1395       // TODO: Only do this on targets with legal s16 shifts
1396       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1397       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1398       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1399       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1400                            elementTypeIs(1, S16)),
1401                        changeTo(1, V2S16))
1402       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1403       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1404       // valid.
1405       .clampScalar(LitTyIdx, S32, S512)
1406       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1407       // Break up vectors with weird elements into scalars
1408       .fewerElementsIf(
1409         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1410         scalarize(0))
1411       .fewerElementsIf(
1412         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1413         scalarize(1))
1414       .clampScalar(BigTyIdx, S32, MaxScalar);
1415 
1416     if (Op == G_MERGE_VALUES) {
1417       Builder.widenScalarIf(
1418         // TODO: Use 16-bit shifts if legal for 8-bit values?
1419         [=](const LegalityQuery &Query) {
1420           const LLT Ty = Query.Types[LitTyIdx];
1421           return Ty.getSizeInBits() < 32;
1422         },
1423         changeTo(LitTyIdx, S32));
1424     }
1425 
1426     Builder.widenScalarIf(
1427       [=](const LegalityQuery &Query) {
1428         const LLT Ty = Query.Types[BigTyIdx];
1429         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1430           Ty.getSizeInBits() % 16 != 0;
1431       },
1432       [=](const LegalityQuery &Query) {
1433         // Pick the next power of 2, or a multiple of 64 over 128.
1434         // Whichever is smaller.
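        // For example (illustrative values): s88 widens to the next power of
        // two, s128, while s328 widens to s384 (the next multiple of 64)
        // rather than all the way to s512.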
1435         const LLT &Ty = Query.Types[BigTyIdx];
1436         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1437         if (NewSizeInBits >= 256) {
1438           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1439           if (RoundedTo < NewSizeInBits)
1440             NewSizeInBits = RoundedTo;
1441         }
1442         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1443       })
1444       .legalIf([=](const LegalityQuery &Query) {
1445           const LLT &BigTy = Query.Types[BigTyIdx];
1446           const LLT &LitTy = Query.Types[LitTyIdx];
1447 
1448           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1449             return false;
1450           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1451             return false;
1452 
1453           return BigTy.getSizeInBits() % 16 == 0 &&
1454                  LitTy.getSizeInBits() % 16 == 0 &&
1455                  BigTy.getSizeInBits() <= MaxRegisterSize;
1456         })
1457       // Any vectors left are the wrong size. Scalarize them.
1458       .scalarize(0)
1459       .scalarize(1);
1460   }
1461 
1462   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1463   // RegBankSelect.
1464   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1465     .legalFor({{S32}, {S64}});
1466 
1467   if (ST.hasVOP3PInsts()) {
1468     SextInReg.lowerFor({{V2S16}})
1469       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1470       // get more vector shift opportunities, since we'll get those when
1471       // expanded.
1472       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1473   } else if (ST.has16BitInsts()) {
1474     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1475   } else {
1476     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1478     SextInReg.lowerFor({{S32}, {S64}});
1479   }
1480 
1481   SextInReg
1482     .scalarize(0)
1483     .clampScalar(0, S32, S64)
1484     .lower();
1485 
1486   getActionDefinitionsBuilder(G_FSHR)
1487     .legalFor({{S32, S32}})
1488     .scalarize(0)
1489     .lower();
1490 
1491   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1492     .legalFor({S64});
1493 
1494   getActionDefinitionsBuilder({
1495       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1496       G_FCOPYSIGN,
1497 
1498       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1499       G_ATOMICRMW_NAND,
1500       G_ATOMICRMW_FSUB,
1501       G_READ_REGISTER,
1502       G_WRITE_REGISTER,
1503 
1504       G_SADDO, G_SSUBO,
1505 
      // TODO: Implement
1507       G_FMINIMUM, G_FMAXIMUM,
1508       G_FSHL
1509     }).lower();
1510 
1511   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1512         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1513         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1514     .unsupported();
1515 
1516   computeTables();
1517   verify(*ST.getInstrInfo());
1518 }
1519 
1520 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1521                                          MachineInstr &MI) const {
1522   MachineIRBuilder &B = Helper.MIRBuilder;
1523   MachineRegisterInfo &MRI = *B.getMRI();
1524   GISelChangeObserver &Observer = Helper.Observer;
1525 
1526   switch (MI.getOpcode()) {
1527   case TargetOpcode::G_ADDRSPACE_CAST:
1528     return legalizeAddrSpaceCast(MI, MRI, B);
1529   case TargetOpcode::G_FRINT:
1530     return legalizeFrint(MI, MRI, B);
1531   case TargetOpcode::G_FCEIL:
1532     return legalizeFceil(MI, MRI, B);
1533   case TargetOpcode::G_INTRINSIC_TRUNC:
1534     return legalizeIntrinsicTrunc(MI, MRI, B);
1535   case TargetOpcode::G_SITOFP:
1536     return legalizeITOFP(MI, MRI, B, true);
1537   case TargetOpcode::G_UITOFP:
1538     return legalizeITOFP(MI, MRI, B, false);
1539   case TargetOpcode::G_FPTOSI:
1540     return legalizeFPTOI(MI, MRI, B, true);
1541   case TargetOpcode::G_FPTOUI:
1542     return legalizeFPTOI(MI, MRI, B, false);
1543   case TargetOpcode::G_FMINNUM:
1544   case TargetOpcode::G_FMAXNUM:
1545   case TargetOpcode::G_FMINNUM_IEEE:
1546   case TargetOpcode::G_FMAXNUM_IEEE:
1547     return legalizeMinNumMaxNum(Helper, MI);
1548   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1549     return legalizeExtractVectorElt(MI, MRI, B);
1550   case TargetOpcode::G_INSERT_VECTOR_ELT:
1551     return legalizeInsertVectorElt(MI, MRI, B);
1552   case TargetOpcode::G_SHUFFLE_VECTOR:
1553     return legalizeShuffleVector(MI, MRI, B);
1554   case TargetOpcode::G_FSIN:
1555   case TargetOpcode::G_FCOS:
1556     return legalizeSinCos(MI, MRI, B);
1557   case TargetOpcode::G_GLOBAL_VALUE:
1558     return legalizeGlobalValue(MI, MRI, B);
1559   case TargetOpcode::G_LOAD:
1560     return legalizeLoad(MI, MRI, B, Observer);
1561   case TargetOpcode::G_FMAD:
1562     return legalizeFMad(MI, MRI, B);
1563   case TargetOpcode::G_FDIV:
1564     return legalizeFDIV(MI, MRI, B);
1565   case TargetOpcode::G_UDIV:
1566   case TargetOpcode::G_UREM:
1567     return legalizeUDIV_UREM(MI, MRI, B);
1568   case TargetOpcode::G_SDIV:
1569   case TargetOpcode::G_SREM:
1570     return legalizeSDIV_SREM(MI, MRI, B);
1571   case TargetOpcode::G_ATOMIC_CMPXCHG:
1572     return legalizeAtomicCmpXChg(MI, MRI, B);
1573   case TargetOpcode::G_FLOG:
1574     return legalizeFlog(MI, B, numbers::ln2f);
1575   case TargetOpcode::G_FLOG10:
1576     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1577   case TargetOpcode::G_FEXP:
1578     return legalizeFExp(MI, B);
1579   case TargetOpcode::G_FPOW:
1580     return legalizeFPow(MI, B);
1581   case TargetOpcode::G_FFLOOR:
1582     return legalizeFFloor(MI, MRI, B);
1583   case TargetOpcode::G_BUILD_VECTOR:
1584     return legalizeBuildVector(MI, MRI, B);
1585   default:
1586     return false;
1587   }
1588 
1589   llvm_unreachable("expected switch to return");
1590 }
1591 
1592 Register AMDGPULegalizerInfo::getSegmentAperture(
1593   unsigned AS,
1594   MachineRegisterInfo &MRI,
1595   MachineIRBuilder &B) const {
1596   MachineFunction &MF = B.getMF();
1597   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1598   const LLT S32 = LLT::scalar(32);
1599 
1600   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1601 
1602   if (ST.hasApertureRegs()) {
1603     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1604     // getreg.
1605     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1606         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1607         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1608     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1609         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1610         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1611     unsigned Encoding =
1612         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1613         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1614         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1615 
1616     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1617 
1618     B.buildInstr(AMDGPU::S_GETREG_B32)
1619       .addDef(GetReg)
1620       .addImm(Encoding);
1621     MRI.setType(GetReg, S32);
1622 
1623     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
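    // S_GETREG_B32 returns the aperture field in the low bits of the result;
    // shift it up by the field width so it forms the high bits of the 32-bit
    // aperture value used as the upper half of a flat address.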
1624     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1625   }
1626 
1627   Register QueuePtr = MRI.createGenericVirtualRegister(
1628     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1629 
1630   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1631   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1632     return Register();
1633 
1634   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1635   // private_segment_aperture_base_hi.
1636   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1637 
1638   // TODO: can we be smarter about machine pointer info?
1639   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1640   MachineMemOperand *MMO = MF.getMachineMemOperand(
1641       PtrInfo,
1642       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1643           MachineMemOperand::MOInvariant,
1644       4, commonAlignment(Align(64), StructOffset));
1645 
1646   Register LoadAddr;
1647 
1648   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1649   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1650 }
1651 
1652 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1653   MachineInstr &MI, MachineRegisterInfo &MRI,
1654   MachineIRBuilder &B) const {
1655   MachineFunction &MF = B.getMF();
1656 
1657   const LLT S32 = LLT::scalar(32);
1658   Register Dst = MI.getOperand(0).getReg();
1659   Register Src = MI.getOperand(1).getReg();
1660 
1661   LLT DstTy = MRI.getType(Dst);
1662   LLT SrcTy = MRI.getType(Src);
1663   unsigned DestAS = DstTy.getAddressSpace();
1664   unsigned SrcAS = SrcTy.getAddressSpace();
1665 
1666   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1667   // vector element.
1668   assert(!DstTy.isVector());
1669 
1670   const AMDGPUTargetMachine &TM
1671     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1672 
1673   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1674   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1675     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1676     return true;
1677   }
1678 
1679   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1680     // Truncate.
1681     B.buildExtract(Dst, Src, 0);
1682     MI.eraseFromParent();
1683     return true;
1684   }
1685 
1686   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1687     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1688     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1689 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1693     auto HighAddr = B.buildConstant(
1694       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1695     B.buildMerge(Dst, {Src, HighAddr});
1696     MI.eraseFromParent();
1697     return true;
1698   }
1699 
1700   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1701     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1702            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1703     unsigned NullVal = TM.getNullPointerValue(DestAS);
1704 
1705     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1706     auto FlatNull = B.buildConstant(SrcTy, 0);
1707 
1708     // Extract low 32-bits of the pointer.
1709     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1710 
1711     auto CmpRes =
1712         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1713     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1714 
1715     MI.eraseFromParent();
1716     return true;
1717   }
1718 
1719   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1720     return false;
1721 
1722   if (!ST.hasFlatAddressSpace())
1723     return false;
1724 
1725   auto SegmentNull =
1726       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1727   auto FlatNull =
1728       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1729 
1730   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1731   if (!ApertureReg.isValid())
1732     return false;
1733 
1734   auto CmpRes =
1735       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1736 
1737   // Coerce the type of the low half of the result so we can use merge_values.
1738   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1739 
1740   // TODO: Should we allow mismatched types but matching sizes in merges to
1741   // avoid the ptrtoint?
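  // The flat pointer is formed from the 32-bit segment offset in the low half
  // and the aperture base in the high half; a null segment pointer is mapped
  // to the flat null value by the select below.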
1742   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1743   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1744 
1745   MI.eraseFromParent();
1746   return true;
1747 }
1748 
1749 bool AMDGPULegalizerInfo::legalizeFrint(
1750   MachineInstr &MI, MachineRegisterInfo &MRI,
1751   MachineIRBuilder &B) const {
1752   Register Src = MI.getOperand(1).getReg();
1753   LLT Ty = MRI.getType(Src);
1754   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
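
  // Use the 2^52 rounding trick: adding and then subtracting
  // copysign(2^52, x) rounds away the fraction bits. Inputs with a magnitude
  // above C2 (just below 2^52) are already integral, so the final select
  // returns them unchanged.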
1755 
1756   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1757   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1758 
1759   auto C1 = B.buildFConstant(Ty, C1Val);
1760   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1761 
1762   // TODO: Should this propagate fast-math-flags?
1763   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1764   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1765 
1766   auto C2 = B.buildFConstant(Ty, C2Val);
1767   auto Fabs = B.buildFAbs(Ty, Src);
1768 
1769   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1770   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1771   MI.eraseFromParent();
1772   return true;
1773 }
1774 
1775 bool AMDGPULegalizerInfo::legalizeFceil(
1776   MachineInstr &MI, MachineRegisterInfo &MRI,
1777   MachineIRBuilder &B) const {
1778 
1779   const LLT S1 = LLT::scalar(1);
1780   const LLT S64 = LLT::scalar(64);
1781 
1782   Register Src = MI.getOperand(1).getReg();
1783   assert(MRI.getType(Src) == S64);
1784 
1785   // result = trunc(src)
1786   // if (src > 0.0 && src != result)
1787   //   result += 1.0
1788 
1789   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1790 
1791   const auto Zero = B.buildFConstant(S64, 0.0);
1792   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1796   auto Add = B.buildSelect(S64, And, One, Zero);
1797 
1798   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1801 }
1802 
1803 static MachineInstrBuilder extractF64Exponent(Register Hi,
1804                                               MachineIRBuilder &B) {
1805   const unsigned FractBits = 52;
1806   const unsigned ExpBits = 11;
1807   LLT S32 = LLT::scalar(32);
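
  // The exponent of an IEEE-754 double lives in bits [62:52], i.e. bits
  // [30:20] of the high word. Extract the 11-bit field with ubfe and subtract
  // the exponent bias (1023).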
1808 
1809   auto Const0 = B.buildConstant(S32, FractBits - 32);
1810   auto Const1 = B.buildConstant(S32, ExpBits);
1811 
1812   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1813     .addUse(Hi)
1814     .addUse(Const0.getReg(0))
1815     .addUse(Const1.getReg(0));
1816 
1817   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1818 }
1819 
1820 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1821   MachineInstr &MI, MachineRegisterInfo &MRI,
1822   MachineIRBuilder &B) const {
1823   const LLT S1 = LLT::scalar(1);
1824   const LLT S32 = LLT::scalar(32);
1825   const LLT S64 = LLT::scalar(64);
1826 
1827   Register Src = MI.getOperand(1).getReg();
1828   assert(MRI.getType(Src) == S64);
1829 
1830   // TODO: Should this use extract since the low half is unused?
1831   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1832   Register Hi = Unmerge.getReg(1);
1833 
1834   // Extract the upper half, since this is where we will find the sign and
1835   // exponent.
1836   auto Exp = extractF64Exponent(Hi, B);
1837 
1838   const unsigned FractBits = 52;
1839 
1840   // Extract the sign bit.
1841   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1842   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1843 
1844   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1845 
1846   const auto Zero32 = B.buildConstant(S32, 0);
1847 
1848   // Extend back to 64-bits.
1849   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1850 
1851   auto Shr = B.buildAShr(S64, FractMask, Exp);
1852   auto Not = B.buildNot(S64, Shr);
1853   auto Tmp0 = B.buildAnd(S64, Src, Not);
1854   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1855 
1856   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1857   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1858 
1859   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1860   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1861   MI.eraseFromParent();
1862   return true;
1863 }
1864 
1865 bool AMDGPULegalizerInfo::legalizeITOFP(
1866   MachineInstr &MI, MachineRegisterInfo &MRI,
1867   MachineIRBuilder &B, bool Signed) const {
1868 
1869   Register Dst = MI.getOperand(0).getReg();
1870   Register Src = MI.getOperand(1).getReg();
1871 
1872   const LLT S64 = LLT::scalar(64);
1873   const LLT S32 = LLT::scalar(32);
1874 
1875   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
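
  // Treat the input as Hi * 2^32 + Lo: convert Hi with the signedness of the
  // original operation, scale it by 2^32 with ldexp, then add the unsigned
  // conversion of Lo.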
1876 
1877   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1878 
1879   auto CvtHi = Signed ?
1880     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1881     B.buildUITOFP(S64, Unmerge.getReg(1));
1882 
1883   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1884 
1885   auto ThirtyTwo = B.buildConstant(S32, 32);
1886   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1887     .addUse(CvtHi.getReg(0))
1888     .addUse(ThirtyTwo.getReg(0));
1889 
1890   // TODO: Should this propagate fast-math-flags?
1891   B.buildFAdd(Dst, LdExp, CvtLo);
1892   MI.eraseFromParent();
1893   return true;
1894 }
1895 
1896 // TODO: Copied from DAG implementation. Verify logic and document how this
1897 // actually works.
1898 bool AMDGPULegalizerInfo::legalizeFPTOI(
1899   MachineInstr &MI, MachineRegisterInfo &MRI,
1900   MachineIRBuilder &B, bool Signed) const {
1901 
1902   Register Dst = MI.getOperand(0).getReg();
1903   Register Src = MI.getOperand(1).getReg();
1904 
1905   const LLT S64 = LLT::scalar(64);
1906   const LLT S32 = LLT::scalar(32);
1907 
1908   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1909 
1910   unsigned Flags = MI.getFlags();
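
  // K0 is 2^-32 and K1 is -(2^32): the high half is floor(trunc(x) * 2^-32),
  // and the low half is the remainder, fma(floor_part, -(2^32), trunc(x)).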
1911 
1912   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1913   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1914   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1915 
1916   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1917   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1918   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1919 
1920   auto Hi = Signed ?
1921     B.buildFPTOSI(S32, FloorMul) :
1922     B.buildFPTOUI(S32, FloorMul);
1923   auto Lo = B.buildFPTOUI(S32, Fma);
1924 
1925   B.buildMerge(Dst, { Lo, Hi });
1926   MI.eraseFromParent();
1927 
1928   return true;
1929 }
1930 
1931 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1932                                                MachineInstr &MI) const {
1933   MachineFunction &MF = Helper.MIRBuilder.getMF();
1934   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1935 
1936   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1937                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1938 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1941   if (!MFI->getMode().IEEE)
1942     return !IsIEEEOp;
1943 
1944   if (IsIEEEOp)
1945     return true;
1946 
1947   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1948 }
1949 
1950 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1951   MachineInstr &MI, MachineRegisterInfo &MRI,
1952   MachineIRBuilder &B) const {
1953   // TODO: Should move some of this into LegalizerHelper.
1954 
1955   // TODO: Promote dynamic indexing of s16 to s32
1956 
1957   // FIXME: Artifact combiner probably should have replaced the truncated
1958   // constant before this, so we shouldn't need
1959   // getConstantVRegValWithLookThrough.
1960   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1961     MI.getOperand(2).getReg(), MRI);
1962   if (!IdxVal) // Dynamic case will be selected to register indexing.
1963     return true;
1964 
1965   Register Dst = MI.getOperand(0).getReg();
1966   Register Vec = MI.getOperand(1).getReg();
1967 
1968   LLT VecTy = MRI.getType(Vec);
1969   LLT EltTy = VecTy.getElementType();
1970   assert(EltTy == MRI.getType(Dst));
1971 
1972   if (IdxVal->Value < VecTy.getNumElements())
1973     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1974   else
1975     B.buildUndef(Dst);
1976 
1977   MI.eraseFromParent();
1978   return true;
1979 }
1980 
1981 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1982   MachineInstr &MI, MachineRegisterInfo &MRI,
1983   MachineIRBuilder &B) const {
1984   // TODO: Should move some of this into LegalizerHelper.
1985 
1986   // TODO: Promote dynamic indexing of s16 to s32
1987 
1988   // FIXME: Artifact combiner probably should have replaced the truncated
1989   // constant before this, so we shouldn't need
1990   // getConstantVRegValWithLookThrough.
1991   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1992     MI.getOperand(3).getReg(), MRI);
1993   if (!IdxVal) // Dynamic case will be selected to register indexing.
1994     return true;
1995 
1996   Register Dst = MI.getOperand(0).getReg();
1997   Register Vec = MI.getOperand(1).getReg();
1998   Register Ins = MI.getOperand(2).getReg();
1999 
2000   LLT VecTy = MRI.getType(Vec);
2001   LLT EltTy = VecTy.getElementType();
2002   assert(EltTy == MRI.getType(Ins));
2003 
2004   if (IdxVal->Value < VecTy.getNumElements())
2005     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2006   else
2007     B.buildUndef(Dst);
2008 
2009   MI.eraseFromParent();
2010   return true;
2011 }
2012 
2013 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2014   MachineInstr &MI, MachineRegisterInfo &MRI,
2015   MachineIRBuilder &B) const {
2016   const LLT V2S16 = LLT::vector(2, 16);
2017 
2018   Register Dst = MI.getOperand(0).getReg();
2019   Register Src0 = MI.getOperand(1).getReg();
2020   LLT DstTy = MRI.getType(Dst);
2021   LLT SrcTy = MRI.getType(Src0);
2022 
2023   if (SrcTy == V2S16 && DstTy == V2S16 &&
2024       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2025     return true;
2026 
2027   MachineIRBuilder HelperBuilder(MI);
2028   GISelObserverWrapper DummyObserver;
2029   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2030   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2031 }
2032 
2033 bool AMDGPULegalizerInfo::legalizeSinCos(
2034   MachineInstr &MI, MachineRegisterInfo &MRI,
2035   MachineIRBuilder &B) const {
2036 
2037   Register DstReg = MI.getOperand(0).getReg();
2038   Register SrcReg = MI.getOperand(1).getReg();
2039   LLT Ty = MRI.getType(DstReg);
2040   unsigned Flags = MI.getFlags();
2041 
2042   Register TrigVal;
2043   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2044   if (ST.hasTrigReducedRange()) {
2045     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2046     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2047       .addUse(MulVal.getReg(0))
2048       .setMIFlags(Flags).getReg(0);
2049   } else
2050     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2051 
2052   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2053     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2054   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2055     .addUse(TrigVal)
2056     .setMIFlags(Flags);
2057   MI.eraseFromParent();
2058   return true;
2059 }
2060 
2061 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2062                                                   MachineIRBuilder &B,
2063                                                   const GlobalValue *GV,
2064                                                   int64_t Offset,
2065                                                   unsigned GAFlags) const {
2066   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2067   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2068   // to the following code sequence:
2069   //
2070   // For constant address space:
2071   //   s_getpc_b64 s[0:1]
2072   //   s_add_u32 s0, s0, $symbol
2073   //   s_addc_u32 s1, s1, 0
2074   //
2075   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2076   //   a fixup or relocation is emitted to replace $symbol with a literal
2077   //   constant, which is a pc-relative offset from the encoding of the $symbol
2078   //   operand to the global variable.
2079   //
2080   // For global address space:
2081   //   s_getpc_b64 s[0:1]
2082   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2083   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2084   //
2085   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2086   //   fixups or relocations are emitted to replace $symbol@*@lo and
2087   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2088   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2089   //   operand to the global variable.
2090   //
2091   // What we want here is an offset from the value returned by s_getpc
2092   // (which is the address of the s_add_u32 instruction) to the global
2093   // variable, but since the encoding of $symbol starts 4 bytes after the start
2094   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2095   // small. This requires us to add 4 to the global variable offset in order to
2096   // compute the correct address.
2097 
2098   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2099 
2100   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2101     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2102 
2103   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2104     .addDef(PCReg);
2105 
2106   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2107   if (GAFlags == SIInstrInfo::MO_NONE)
2108     MIB.addImm(0);
2109   else
2110     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2111 
2112   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2113 
2114   if (PtrTy.getSizeInBits() == 32)
2115     B.buildExtract(DstReg, PCReg, 0);
2116   return true;
2117  }
2118 
2119 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2120   MachineInstr &MI, MachineRegisterInfo &MRI,
2121   MachineIRBuilder &B) const {
2122   Register DstReg = MI.getOperand(0).getReg();
2123   LLT Ty = MRI.getType(DstReg);
2124   unsigned AS = Ty.getAddressSpace();
2125 
2126   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2127   MachineFunction &MF = B.getMF();
2128   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2129 
2130   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2131     if (!MFI->isEntryFunction()) {
2132       const Function &Fn = MF.getFunction();
2133       DiagnosticInfoUnsupported BadLDSDecl(
2134         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2135         DS_Warning);
2136       Fn.getContext().diagnose(BadLDSDecl);
2137 
2138       // We currently don't have a way to correctly allocate LDS objects that
2139       // aren't directly associated with a kernel. We do force inlining of
2140       // functions that use local objects. However, if these dead functions are
2141       // not eliminated, we don't want a compile time error. Just emit a warning
2142       // and a trap, since there should be no callable path here.
2143       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2144       B.buildUndef(DstReg);
2145       MI.eraseFromParent();
2146       return true;
2147     }
2148 
2149     // TODO: We could emit code to handle the initialization somewhere.
2150     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2151       const SITargetLowering *TLI = ST.getTargetLowering();
2152       if (!TLI->shouldUseLDSConstAddress(GV)) {
2153         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2155       }
2156 
2157       B.buildConstant(
2158           DstReg,
2159           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2160       MI.eraseFromParent();
2161       return true;
2162     }
2163 
2164     const Function &Fn = MF.getFunction();
2165     DiagnosticInfoUnsupported BadInit(
2166       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2167     Fn.getContext().diagnose(BadInit);
2168     return true;
2169   }
2170 
2171   const SITargetLowering *TLI = ST.getTargetLowering();
2172 
2173   if (TLI->shouldEmitFixup(GV)) {
2174     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2175     MI.eraseFromParent();
2176     return true;
2177   }
2178 
2179   if (TLI->shouldEmitPCReloc(GV)) {
2180     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2181     MI.eraseFromParent();
2182     return true;
2183   }
2184 
2185   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2186   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2187 
2188   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2189       MachinePointerInfo::getGOT(MF),
2190       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2191           MachineMemOperand::MOInvariant,
2192       8 /*Size*/, Align(8));
2193 
2194   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2195 
2196   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2198     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2199     B.buildExtract(DstReg, Load, 0);
2200   } else
2201     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2202 
2203   MI.eraseFromParent();
2204   return true;
2205 }
2206 
2207 bool AMDGPULegalizerInfo::legalizeLoad(
2208   MachineInstr &MI, MachineRegisterInfo &MRI,
2209   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2210   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2211   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2212   Observer.changingInstr(MI);
2213   MI.getOperand(1).setReg(Cast.getReg(0));
2214   Observer.changedInstr(MI);
2215   return true;
2216 }
2217 
2218 bool AMDGPULegalizerInfo::legalizeFMad(
2219   MachineInstr &MI, MachineRegisterInfo &MRI,
2220   MachineIRBuilder &B) const {
2221   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2222   assert(Ty.isScalar());
2223 
2224   MachineFunction &MF = B.getMF();
2225   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2226 
2227   // TODO: Always legal with future ftz flag.
2228   // FIXME: Do we need just output?
2229   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2230     return true;
2231   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2232     return true;
2233 
2234   MachineIRBuilder HelperBuilder(MI);
2235   GISelObserverWrapper DummyObserver;
2236   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2237   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2238 }
2239 
2240 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2241   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2242   Register DstReg = MI.getOperand(0).getReg();
2243   Register PtrReg = MI.getOperand(1).getReg();
2244   Register CmpVal = MI.getOperand(2).getReg();
2245   Register NewVal = MI.getOperand(3).getReg();
2246 
2247   assert(SITargetLowering::isFlatGlobalAddrSpace(
2248            MRI.getType(PtrReg).getAddressSpace()) &&
2249          "this should not have been custom lowered");
2250 
2251   LLT ValTy = MRI.getType(CmpVal);
2252   LLT VecTy = LLT::vector(2, ValTy);
2253 
2254   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2255 
2256   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2257     .addDef(DstReg)
2258     .addUse(PtrReg)
2259     .addUse(PackedVal)
2260     .setMemRefs(MI.memoperands());
2261 
2262   MI.eraseFromParent();
2263   return true;
2264 }
2265 
2266 bool AMDGPULegalizerInfo::legalizeFlog(
2267   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2268   Register Dst = MI.getOperand(0).getReg();
2269   Register Src = MI.getOperand(1).getReg();
2270   LLT Ty = B.getMRI()->getType(Dst);
2271   unsigned Flags = MI.getFlags();
2272 
2273   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2274   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2275 
2276   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2277   MI.eraseFromParent();
2278   return true;
2279 }
2280 
2281 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2282                                        MachineIRBuilder &B) const {
2283   Register Dst = MI.getOperand(0).getReg();
2284   Register Src = MI.getOperand(1).getReg();
2285   unsigned Flags = MI.getFlags();
2286   LLT Ty = B.getMRI()->getType(Dst);
2287 
2288   auto K = B.buildFConstant(Ty, numbers::log2e);
2289   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2290   B.buildFExp2(Dst, Mul, Flags);
2291   MI.eraseFromParent();
2292   return true;
2293 }
2294 
2295 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2296                                        MachineIRBuilder &B) const {
2297   Register Dst = MI.getOperand(0).getReg();
2298   Register Src0 = MI.getOperand(1).getReg();
2299   Register Src1 = MI.getOperand(2).getReg();
2300   unsigned Flags = MI.getFlags();
2301   LLT Ty = B.getMRI()->getType(Dst);
2302   const LLT S16 = LLT::scalar(16);
2303   const LLT S32 = LLT::scalar(32);
2304 
2305   if (Ty == S32) {
2306     auto Log = B.buildFLog2(S32, Src0, Flags);
2307     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2308       .addUse(Log.getReg(0))
2309       .addUse(Src1)
2310       .setMIFlags(Flags);
2311     B.buildFExp2(Dst, Mul, Flags);
2312   } else if (Ty == S16) {
2313     // There's no f16 fmul_legacy, so we need to convert for it.
2314     auto Log = B.buildFLog2(S16, Src0, Flags);
2315     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2316     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2317     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2318       .addUse(Ext0.getReg(0))
2319       .addUse(Ext1.getReg(0))
2320       .setMIFlags(Flags);
2321 
2322     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2323   } else
2324     return false;
2325 
2326   MI.eraseFromParent();
2327   return true;
2328 }
2329 
2330 // Find a source register, ignoring any possible source modifiers.
2331 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2332   Register ModSrc = OrigSrc;
2333   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2334     ModSrc = SrcFNeg->getOperand(1).getReg();
2335     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2336       ModSrc = SrcFAbs->getOperand(1).getReg();
2337   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2338     ModSrc = SrcFAbs->getOperand(1).getReg();
2339   return ModSrc;
2340 }
2341 
2342 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2343                                          MachineRegisterInfo &MRI,
2344                                          MachineIRBuilder &B) const {
2345 
2346   const LLT S1 = LLT::scalar(1);
2347   const LLT S64 = LLT::scalar(64);
2348   Register Dst = MI.getOperand(0).getReg();
2349   Register OrigSrc = MI.getOperand(1).getReg();
2350   unsigned Flags = MI.getFlags();
2351   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2352          "this should not have been custom lowered");
2353 
2354   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2355   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2356   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2357   // V_FRACT bug is:
2358   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2359   //
2360   // Convert floor(x) to (x - fract(x))
2361 
2362   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2363     .addUse(OrigSrc)
2364     .setMIFlags(Flags);
2365 
2366   // Give source modifier matching some assistance before obscuring a foldable
2367   // pattern.
2368 
2369   // TODO: We can avoid the neg on the fract? The input sign to fract
2370   // shouldn't matter?
2371   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2372 
2373   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2374 
2375   Register Min = MRI.createGenericVirtualRegister(S64);
2376 
2377   // We don't need to concern ourselves with the snan handling difference, so
2378   // use the one which will directly select.
2379   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2380   if (MFI->getMode().IEEE)
2381     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2382   else
2383     B.buildFMinNum(Min, Fract, Const, Flags);
2384 
2385   Register CorrectedFract = Min;
2386   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2387     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2388     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2389   }
2390 
2391   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2392   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2393 
2394   MI.eraseFromParent();
2395   return true;
2396 }
2397 
2398 // Turn an illegal packed v2s16 build vector into bit operations.
2399 // TODO: This should probably be a bitcast action in LegalizerHelper.
2400 bool AMDGPULegalizerInfo::legalizeBuildVector(
2401   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2402   Register Dst = MI.getOperand(0).getReg();
2403   const LLT S32 = LLT::scalar(32);
2404   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2405 
2406   Register Src0 = MI.getOperand(1).getReg();
2407   Register Src1 = MI.getOperand(2).getReg();
2408   assert(MRI.getType(Src0) == LLT::scalar(16));
2409 
2410   auto Merge = B.buildMerge(S32, {Src0, Src1});
2411   B.buildBitcast(Dst, Merge);
2412 
2413   MI.eraseFromParent();
2414   return true;
2415 }
2416 
2417 // Return the use branch instruction, otherwise null if the usage is invalid.
2418 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2419                                        MachineRegisterInfo &MRI,
2420                                        MachineInstr *&Br,
2421                                        MachineBasicBlock *&UncondBrTarget) {
2422   Register CondDef = MI.getOperand(0).getReg();
2423   if (!MRI.hasOneNonDBGUse(CondDef))
2424     return nullptr;
2425 
2426   MachineBasicBlock *Parent = MI.getParent();
2427   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2428   if (UseMI.getParent() != Parent ||
2429       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2430     return nullptr;
2431 
2432   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2433   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2434   if (Next == Parent->end()) {
2435     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2436     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2437       return nullptr;
2438     UncondBrTarget = &*NextMBB;
2439   } else {
2440     if (Next->getOpcode() != AMDGPU::G_BR)
2441       return nullptr;
2442     Br = &*Next;
2443     UncondBrTarget = Br->getOperand(0).getMBB();
2444   }
2445 
2446   return &UseMI;
2447 }
2448 
2449 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2450                                                MachineRegisterInfo &MRI,
2451                                                Register LiveIn,
2452                                                Register PhyReg) const {
2453   assert(PhyReg.isPhysical() && "Physical register expected");
2454 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2457   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2458   if (!MRI.getVRegDef(LiveIn)) {
2459     // FIXME: Should have scoped insert pt
2460     MachineBasicBlock &OrigInsBB = B.getMBB();
2461     auto OrigInsPt = B.getInsertPt();
2462 
2463     MachineBasicBlock &EntryMBB = B.getMF().front();
2464     EntryMBB.addLiveIn(PhyReg);
2465     B.setInsertPt(EntryMBB, EntryMBB.begin());
2466     B.buildCopy(LiveIn, PhyReg);
2467 
2468     B.setInsertPt(OrigInsBB, OrigInsPt);
2469   }
2470 
2471   return LiveIn;
2472 }
2473 
2474 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2475                                                 MachineRegisterInfo &MRI,
2476                                                 Register PhyReg, LLT Ty,
2477                                                 bool InsertLiveInCopy) const {
2478   assert(PhyReg.isPhysical() && "Physical register expected");
2479 
  // Get or create the virtual live-in register.
2481   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2482   if (!LiveIn) {
2483     LiveIn = MRI.createGenericVirtualRegister(Ty);
2484     MRI.addLiveIn(PhyReg, LiveIn);
2485   }
2486 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), the live-in copy from the physical
  // register to the virtual register is not required.
2490   if (!InsertLiveInCopy)
2491     return LiveIn;
2492 
2493   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2494 }
2495 
2496 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2497     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2498   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2499   const ArgDescriptor *Arg;
2500   const TargetRegisterClass *RC;
2501   LLT ArgTy;
2502   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2503   if (!Arg) {
2504     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2505     return nullptr;
2506   }
2507   return Arg;
2508 }
2509 
2510 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2511                                          const ArgDescriptor *Arg) const {
2512   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2513     return false; // TODO: Handle these
2514 
2515   Register SrcReg = Arg->getRegister();
2516   assert(SrcReg.isPhysical() && "Physical register expected");
2517   assert(DstReg.isVirtual() && "Virtual register expected");
2518 
2519   MachineRegisterInfo &MRI = *B.getMRI();
2520 
2521   LLT Ty = MRI.getType(DstReg);
2522   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2523 
2524   if (Arg->isMasked()) {
2525     // TODO: Should we try to emit this once in the entry block?
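    // The value occupies a bit field within the physical register; shift the
    // field down to bit zero and mask off the bits above it.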
2526     const LLT S32 = LLT::scalar(32);
2527     const unsigned Mask = Arg->getMask();
2528     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2529 
2530     Register AndMaskSrc = LiveIn;
2531 
2532     if (Shift != 0) {
2533       auto ShiftAmt = B.buildConstant(S32, Shift);
2534       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2535     }
2536 
2537     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2538   } else {
2539     B.buildCopy(DstReg, LiveIn);
2540   }
2541 
2542   return true;
2543 }
2544 
2545 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2546     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2547     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2548 
2549   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2550   if (!Arg)
2551     return false;
2552 
2553   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2554     return false;
2555 
2556   MI.eraseFromParent();
2557   return true;
2558 }
2559 
2560 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2561                                        MachineRegisterInfo &MRI,
2562                                        MachineIRBuilder &B) const {
2563   Register Dst = MI.getOperand(0).getReg();
2564   LLT DstTy = MRI.getType(Dst);
2565   LLT S16 = LLT::scalar(16);
2566   LLT S32 = LLT::scalar(32);
2567   LLT S64 = LLT::scalar(64);
2568 
2569   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2570     return true;
2571 
2572   if (DstTy == S16)
2573     return legalizeFDIV16(MI, MRI, B);
2574   if (DstTy == S32)
2575     return legalizeFDIV32(MI, MRI, B);
2576   if (DstTy == S64)
2577     return legalizeFDIV64(MI, MRI, B);
2578 
2579   return false;
2580 }
2581 
2582 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2583                                                   Register DstReg,
2584                                                   Register X,
2585                                                   Register Y,
2586                                                   bool IsDiv) const {
2587   const LLT S1 = LLT::scalar(1);
2588   const LLT S32 = LLT::scalar(32);
2589 
2590   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2591   // algorithm used here.
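  //
  // In short: approximate 2^32 / y with a scaled V_RCP_IFLAG_F32, refine the
  // fixed-point reciprocal with one Newton-Raphson step, form a quotient
  // estimate with a high multiply, then apply up to two conditional
  // corrections to the quotient and remainder.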
2592 
2593   // Initial estimate of inv(y).
2594   auto FloatY = B.buildUITOFP(S32, Y);
2595   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2596   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2597   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2598   auto Z = B.buildFPTOUI(S32, ScaledY);
2599 
2600   // One round of UNR.
2601   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2602   auto NegYZ = B.buildMul(S32, NegY, Z);
2603   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2604 
2605   // Quotient/remainder estimate.
2606   auto Q = B.buildUMulH(S32, X, Z);
2607   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2608 
2609   // First quotient/remainder refinement.
2610   auto One = B.buildConstant(S32, 1);
2611   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2612   if (IsDiv)
2613     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2614   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2615 
2616   // Second quotient/remainder refinement.
2617   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2618   if (IsDiv)
2619     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2620   else
2621     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2622 }
2623 
2624 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2625                                               MachineRegisterInfo &MRI,
2626                                               MachineIRBuilder &B) const {
2627   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2628   Register DstReg = MI.getOperand(0).getReg();
2629   Register Num = MI.getOperand(1).getReg();
2630   Register Den = MI.getOperand(2).getReg();
2631   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2632   MI.eraseFromParent();
2633   return true;
2634 }
2635 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2637 //
2638 // Return lo, hi of result
2639 //
2640 // %cvt.lo = G_UITOFP Val.lo
2641 // %cvt.hi = G_UITOFP Val.hi
2642 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2643 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2644 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2645 // %mul2 = G_FMUL %mul1, 2**(-32)
2646 // %trunc = G_INTRINSIC_TRUNC %mul2
2647 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2648 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2649 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2650                                                        Register Val) {
2651   const LLT S32 = LLT::scalar(32);
2652   auto Unmerge = B.buildUnmerge(S32, Val);
2653 
2654   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2655   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2656 
2657   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2658                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2659 
2660   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2661   auto Mul1 =
2662       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2663 
2664   // 2**(-32)
2665   auto Mul2 =
2666       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2667   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2668 
2669   // -(2**32)
2670   auto Mad2 = B.buildFMAD(S32, Trunc,
2671                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2672 
2673   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2674   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2675 
2676   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2677 }
2678 
2679 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2680                                                   Register DstReg,
2681                                                   Register Numer,
2682                                                   Register Denom,
2683                                                   bool IsDiv) const {
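  // 64-bit variant of the reciprocal-based expansion above: start from a
  // float-derived estimate of 2^64 / Denom, refine it with two fixed-point
  // Newton-Raphson steps, form the quotient with a 64x64 high multiply, and
  // finish with up to two conditional corrections.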
2684   const LLT S32 = LLT::scalar(32);
2685   const LLT S64 = LLT::scalar(64);
2686   const LLT S1 = LLT::scalar(1);
2687   Register RcpLo, RcpHi;
2688 
2689   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2690 
2691   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2692 
2693   auto Zero64 = B.buildConstant(S64, 0);
2694   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2695 
2696   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2697   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2698 
2699   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2700   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2701   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2702 
2703   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2704   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2705   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2706   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2707 
2708   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2709   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2710   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2711   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2712   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2713 
2714   auto Zero32 = B.buildConstant(S32, 0);
2715   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2716   auto Add2_HiC =
2717       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2718   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2719   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2720 
2721   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2722   Register NumerLo = UnmergeNumer.getReg(0);
2723   Register NumerHi = UnmergeNumer.getReg(1);
2724 
2725   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2726   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2727   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2728   Register Mul3_Lo = UnmergeMul3.getReg(0);
2729   Register Mul3_Hi = UnmergeMul3.getReg(1);
2730   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2731   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2732   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2733   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2734 
2735   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2736   Register DenomLo = UnmergeDenom.getReg(0);
2737   Register DenomHi = UnmergeDenom.getReg(1);
2738 
2739   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2740   auto C1 = B.buildSExt(S32, CmpHi);
2741 
2742   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2743   auto C2 = B.buildSExt(S32, CmpLo);
2744 
2745   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2746   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2747 
  // TODO: Here and below, portions of the code could be enclosed in if/endif.
  // Currently control flow is unconditional and we have 4 selects after the
  // potential endif to substitute PHIs.
2751 
2752   // if C3 != 0 ...
2753   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2754   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2755   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2756   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2757 
2758   auto One64 = B.buildConstant(S64, 1);
2759   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2760 
2761   auto C4 =
2762       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2763   auto C5 =
2764       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2765   auto C6 = B.buildSelect(
2766       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2767 
2768   // if (C6 != 0)
2769   auto Add4 = B.buildAdd(S64, Add3, One64);
2770   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2771 
2772   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2773   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2774   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2775 
2776   // endif C6
2777   // endif C3
2778 
2779   if (IsDiv) {
2780     auto Sel1 = B.buildSelect(
2781         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2782     B.buildSelect(DstReg,
2783                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2784   } else {
2785     auto Sel2 = B.buildSelect(
2786         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2787     B.buildSelect(DstReg,
2788                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2789   }
2790 }
2791 
2792 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2793                                             MachineRegisterInfo &MRI,
2794                                             MachineIRBuilder &B) const {
2795   const LLT S64 = LLT::scalar(64);
2796   const LLT S32 = LLT::scalar(32);
2797   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2798   Register DstReg = MI.getOperand(0).getReg();
2799   Register Num = MI.getOperand(1).getReg();
2800   Register Den = MI.getOperand(2).getReg();
2801   LLT Ty = MRI.getType(DstReg);
2802 
2803   if (Ty == S32)
2804     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2805   else if (Ty == S64)
2806     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2807   else
2808     return false;
2809 
2810   MI.eraseFromParent();
  return true;
}
2814 
2815 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2816                                             MachineRegisterInfo &MRI,
2817                                             MachineIRBuilder &B) const {
2818   const LLT S64 = LLT::scalar(64);
2819   const LLT S32 = LLT::scalar(32);
2820 
2821   Register DstReg = MI.getOperand(0).getReg();
2822   const LLT Ty = MRI.getType(DstReg);
2823   if (Ty != S32 && Ty != S64)
2824     return false;
2825 
2826   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2827 
2828   Register LHS = MI.getOperand(1).getReg();
2829   Register RHS = MI.getOperand(2).getReg();
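
  // Take absolute values as (x + sign) ^ sign, where sign is x arithmetically
  // shifted right by (bits - 1). After the unsigned division, the quotient is
  // negated when the operand signs differ, and the remainder takes the sign
  // of the dividend, using the same (x ^ sign) - sign pattern.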
2830 
2831   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2832   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2833   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2834 
2835   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2836   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2837 
2838   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2839   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2840 
2841   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2842   if (Ty == S32)
2843     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2844   else
2845     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2846 
2847   Register Sign;
2848   if (IsDiv)
2849     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2850   else
2851     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2852 
2853   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2854   B.buildSub(DstReg, UDivRem, Sign);
2855 
2856   MI.eraseFromParent();
2857   return true;
2858 }
2859 
2860 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2861                                                  MachineRegisterInfo &MRI,
2862                                                  MachineIRBuilder &B) const {
2863   Register Res = MI.getOperand(0).getReg();
2864   Register LHS = MI.getOperand(1).getReg();
2865   Register RHS = MI.getOperand(2).getReg();
2866 
2867   uint16_t Flags = MI.getFlags();
2868 
2869   LLT ResTy = MRI.getType(Res);
2870   LLT S32 = LLT::scalar(32);
2871   LLT S64 = LLT::scalar(64);
2872 
2873   const MachineFunction &MF = B.getMF();
2874   bool Unsafe =
2875     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2876 
2877   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2878     return false;
2879 
2880   if (!Unsafe && ResTy == S32 &&
2881       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2882     return false;
2883 
2884   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2885     // 1 / x -> RCP(x)
2886     if (CLHS->isExactlyValue(1.0)) {
2887       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2888         .addUse(RHS)
2889         .setMIFlags(Flags);
2890 
2891       MI.eraseFromParent();
2892       return true;
2893     }
2894 
2895     // -1 / x -> RCP( FNEG(x) )
2896     if (CLHS->isExactlyValue(-1.0)) {
2897       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2898       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2899         .addUse(FNeg.getReg(0))
2900         .setMIFlags(Flags);
2901 
2902       MI.eraseFromParent();
2903       return true;
2904     }
2905   }
2906 
2907   // x / y -> x * (1.0 / y)
2908   if (Unsafe) {
2909     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2910       .addUse(RHS)
2911       .setMIFlags(Flags);
2912     B.buildFMul(Res, LHS, RCP, Flags);
2913 
2914     MI.eraseFromParent();
2915     return true;
2916   }
2917 
2918   return false;
2919 }
2920 
2921 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2922                                          MachineRegisterInfo &MRI,
2923                                          MachineIRBuilder &B) const {
2924   Register Res = MI.getOperand(0).getReg();
2925   Register LHS = MI.getOperand(1).getReg();
2926   Register RHS = MI.getOperand(2).getReg();
2927 
2928   uint16_t Flags = MI.getFlags();
2929 
2930   LLT S16 = LLT::scalar(16);
2931   LLT S32 = LLT::scalar(32);
2932 
2933   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2934   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2935 
2936   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2937     .addUse(RHSExt.getReg(0))
2938     .setMIFlags(Flags);
2939 
2940   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2941   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2942 
2943   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2944     .addUse(RDst.getReg(0))
2945     .addUse(RHS)
2946     .addUse(LHS)
2947     .setMIFlags(Flags);
2948 
2949   MI.eraseFromParent();
2950   return true;
2951 }
2952 
2953 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2954 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2955 static void toggleSPDenormMode(bool Enable,
2956                                MachineIRBuilder &B,
2957                                const GCNSubtarget &ST,
2958                                AMDGPU::SIModeRegisterDefaults Mode) {
2959   // Set SP denorm mode to this value.
2960   unsigned SPDenormMode =
2961     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2962 
2963   if (ST.hasDenormModeInst()) {
2964     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2965     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2966 
2967     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2968     B.buildInstr(AMDGPU::S_DENORM_MODE)
2969       .addImm(NewDenormModeValue);
2970 
2971   } else {
2972     // Select FP32 bit field in mode register.
2973     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2974                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2975                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
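    // This encodes hwreg(HW_REG_MODE, 4, 2): WIDTH_M1 is the field width
    // minus one, so offset 4 with width 2 selects the two FP32 denorm bits of
    // the MODE register while leaving the FP64/FP16 denorm bits alone.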
2976 
2977     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2978       .addImm(SPDenormMode)
2979       .addImm(SPDenormModeBitField);
2980   }
2981 }
2982 
2983 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2984                                          MachineRegisterInfo &MRI,
2985                                          MachineIRBuilder &B) const {
2986   Register Res = MI.getOperand(0).getReg();
2987   Register LHS = MI.getOperand(1).getReg();
2988   Register RHS = MI.getOperand(2).getReg();
2989   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2990   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2991 
2992   uint16_t Flags = MI.getFlags();
2993 
2994   LLT S32 = LLT::scalar(32);
2995   LLT S1 = LLT::scalar(1);
2996 
2997   auto One = B.buildFConstant(S32, 1.0f);
2998 
2999   auto DenominatorScaled =
3000     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3001       .addUse(LHS)
3002       .addUse(RHS)
3003       .addImm(0)
3004       .setMIFlags(Flags);
3005   auto NumeratorScaled =
3006     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3007       .addUse(LHS)
3008       .addUse(RHS)
3009       .addImm(1)
3010       .setMIFlags(Flags);
3011 
3012   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3013     .addUse(DenominatorScaled.getReg(0))
3014     .setMIFlags(Flags);
3015   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3016 
3017   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3018   // aren't modeled as reading it.
3019   if (!Mode.allFP32Denormals())
3020     toggleSPDenormMode(true, B, ST, Mode);
3021 
3022   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3023   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3024   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3025   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3026   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3027   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
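  // A sketch of the refinement above, with r = ApproxRcp and n, d the scaled
  // numerator/denominator: Fma0 = 1 - d*r is the reciprocal error,
  // Fma1 = r + r*Fma0 the refined reciprocal, Mul = n*Fma1 the initial
  // quotient, Fma2 = n - d*Mul its residual, Fma3 = Mul + Fma2*Fma1 the
  // refined quotient, and Fma4 = n - d*Fma3 the residual consumed by div_fmas
  // below.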
3028 
3029   if (!Mode.allFP32Denormals())
3030     toggleSPDenormMode(false, B, ST, Mode);
3031 
3032   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3033     .addUse(Fma4.getReg(0))
3034     .addUse(Fma1.getReg(0))
3035     .addUse(Fma3.getReg(0))
3036     .addUse(NumeratorScaled.getReg(1))
3037     .setMIFlags(Flags);
3038 
3039   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3040     .addUse(Fmas.getReg(0))
3041     .addUse(RHS)
3042     .addUse(LHS)
3043     .setMIFlags(Flags);
3044 
3045   MI.eraseFromParent();
3046   return true;
3047 }
3048 
3049 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3050                                          MachineRegisterInfo &MRI,
3051                                          MachineIRBuilder &B) const {
3052   Register Res = MI.getOperand(0).getReg();
3053   Register LHS = MI.getOperand(1).getReg();
3054   Register RHS = MI.getOperand(2).getReg();
3055 
3056   uint16_t Flags = MI.getFlags();
3057 
3058   LLT S64 = LLT::scalar(64);
3059   LLT S1 = LLT::scalar(1);
3060 
3061   auto One = B.buildFConstant(S64, 1.0);
3062 
3063   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3064     .addUse(LHS)
3065     .addUse(RHS)
3066     .addImm(0)
3067     .setMIFlags(Flags);
3068 
3069   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3070 
3071   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3072     .addUse(DivScale0.getReg(0))
3073     .setMIFlags(Flags);
3074 
3075   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3076   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3077   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3078 
3079   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3080     .addUse(LHS)
3081     .addUse(RHS)
3082     .addImm(1)
3083     .setMIFlags(Flags);
3084 
3085   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3086   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3087   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
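  // Same style of refinement as the f32 path, at double precision: Fma0..Fma3
  // are two reciprocal refinement steps on the scaled denominator, Mul is the
  // initial quotient of the scaled numerator, and Fma4 its residual, which
  // feeds div_fmas below.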
3088 
3089   Register Scale;
3090   if (!ST.hasUsableDivScaleConditionOutput()) {
3091     // Workaround a hardware bug on SI where the condition output from div_scale
3092     // is not usable.
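    // Reconstruct it manually instead: compare the high dwords of the original
    // operands with the high dwords of the div_scale results to tell which
    // operand was scaled, and use the xor of those two tests in place of the
    // condition bit.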
3093 
3094     LLT S32 = LLT::scalar(32);
3095 
3096     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3097     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3098     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3099     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3100 
3101     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3102                               Scale1Unmerge.getReg(1));
3103     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3104                               Scale0Unmerge.getReg(1));
3105     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3106   } else {
3107     Scale = DivScale1.getReg(1);
3108   }
3109 
3110   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3111     .addUse(Fma4.getReg(0))
3112     .addUse(Fma3.getReg(0))
3113     .addUse(Mul.getReg(0))
3114     .addUse(Scale)
3115     .setMIFlags(Flags);
3116 
3117   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3118     .addUse(Fmas.getReg(0))
3119     .addUse(RHS)
3120     .addUse(LHS)
3121     .setMIFlags(Flags);
3122 
3123   MI.eraseFromParent();
3124   return true;
3125 }
3126 
3127 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3128                                                  MachineRegisterInfo &MRI,
3129                                                  MachineIRBuilder &B) const {
3130   Register Res = MI.getOperand(0).getReg();
3131   Register LHS = MI.getOperand(2).getReg();
3132   Register RHS = MI.getOperand(3).getReg();
3133   uint16_t Flags = MI.getFlags();
3134 
3135   LLT S32 = LLT::scalar(32);
3136   LLT S1 = LLT::scalar(1);
3137 
3138   auto Abs = B.buildFAbs(S32, RHS, Flags);
3139   const APFloat C0Val(1.0f);
3140 
3141   auto C0 = B.buildConstant(S32, 0x6f800000);
3142   auto C1 = B.buildConstant(S32, 0x2f800000);
3143   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
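  // Interpreted as f32 bit patterns, C0 = 0x6f800000 is 2^+96 and
  // C1 = 0x2f800000 is 2^-32: if |RHS| is larger than 2^96 the denominator is
  // pre-scaled by 2^-32 so its reciprocal stays well inside the normal range,
  // and the final multiply by Sel cancels the scaling again.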
3144 
3145   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3146   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3147 
3148   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3149 
3150   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3151     .addUse(Mul0.getReg(0))
3152     .setMIFlags(Flags);
3153 
3154   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3155 
3156   B.buildFMul(Res, Sel, Mul1, Flags);
3157 
3158   MI.eraseFromParent();
3159   return true;
3160 }
3161 
3162 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3163                                             MachineRegisterInfo &MRI,
3164                                             MachineIRBuilder &B) const {
3165   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3166   uint64_t Offset =
3167     ST.getTargetLowering()->getImplicitParameterOffset(
3168       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3169   LLT DstTy = MRI.getType(DstReg);
3170   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3171 
3172   const ArgDescriptor *Arg;
3173   const TargetRegisterClass *RC;
3174   LLT ArgTy;
3175   std::tie(Arg, RC, ArgTy) =
3176       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3177   if (!Arg)
3178     return false;
3179 
3180   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3181   if (!loadInputValue(KernargPtrReg, B, Arg))
3182     return false;
3183 
3184   // FIXME: This should be nuw
3185   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3186   return true;
3187 }
3188 
3189 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3190                                                  MachineRegisterInfo &MRI,
3191                                                  MachineIRBuilder &B) const {
3192   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3193   if (!MFI->isEntryFunction()) {
3194     return legalizePreloadedArgIntrin(MI, MRI, B,
3195                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3196   }
3197 
3198   Register DstReg = MI.getOperand(0).getReg();
3199   if (!getImplicitArgPtr(DstReg, MRI, B))
3200     return false;
3201 
3202   MI.eraseFromParent();
3203   return true;
3204 }
3205 
3206 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3207                                               MachineRegisterInfo &MRI,
3208                                               MachineIRBuilder &B,
3209                                               unsigned AddrSpace) const {
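  // A flat address is in the LDS (is.shared) or scratch (is.private) aperture
  // when its high 32 bits equal the corresponding aperture base, so compare
  // the extracted high dword of the pointer against the aperture value.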
3210   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3211   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3212   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3213   MI.eraseFromParent();
3214   return true;
3215 }
3216 
3217 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3218 // offset (the offset that is included in bounds checking and swizzling, to be
3219 // split between the instruction's voffset and immoffset fields) and soffset
3220 // (the offset that is excluded from bounds checking and swizzling, to go in
3221 // the instruction's soffset field).  This function takes the first kind of
3222 // offset and figures out how to split it between voffset and immoffset.
3223 std::tuple<Register, unsigned, unsigned>
3224 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3225                                         Register OrigOffset) const {
3226   const unsigned MaxImm = 4095;
3227   Register BaseReg;
3228   unsigned TotalConstOffset;
3229   MachineInstr *OffsetDef;
3230   const LLT S32 = LLT::scalar(32);
3231 
3232   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3233     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3234 
3235   unsigned ImmOffset = TotalConstOffset;
3236 
3237   // If the immediate value is too big for the immoffset field, keep only its
3238   // low 12 bits (the value modulo 4096) there, so that the value copied/added
3239   // for the voffset field is a multiple of 4096 and stands more chance of
3240   // being CSEd with the copy/add for another similar load/store.
3241   // However, do not do that rounding down to a multiple of 4096 if that is a
3242   // negative number, as it appears to be illegal to have a negative offset
3243   // in the vgpr, even if adding the immediate offset makes it positive.
3244   unsigned Overflow = ImmOffset & ~MaxImm;
3245   ImmOffset -= Overflow;
3246   if ((int32_t)Overflow < 0) {
3247     Overflow += ImmOffset;
3248     ImmOffset = 0;
3249   }
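  // For example, a constant offset of 4100 becomes ImmOffset = 4 plus a
  // voffset contribution of 4096, while a value with the sign bit set is left
  // entirely in the voffset add with ImmOffset = 0.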
3250 
3251   if (Overflow != 0) {
3252     if (!BaseReg) {
3253       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3254     } else {
3255       auto OverflowVal = B.buildConstant(S32, Overflow);
3256       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3257     }
3258   }
3259 
3260   if (!BaseReg)
3261     BaseReg = B.buildConstant(S32, 0).getReg(0);
3262 
3263   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3264 }
3265 
3266 /// Handle register layout difference for f16 images for some subtargets.
3267 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3268                                              MachineRegisterInfo &MRI,
3269                                              Register Reg) const {
3270   if (!ST.hasUnpackedD16VMem())
3271     return Reg;
3272 
3273   const LLT S16 = LLT::scalar(16);
3274   const LLT S32 = LLT::scalar(32);
3275   LLT StoreVT = MRI.getType(Reg);
3276   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3277 
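  // On unpacked subtargets each 16-bit element lives in the low half of a
  // 32-bit register, so e.g. a <2 x s16> store value is rewritten to a
  // <2 x s32> built from any-extended elements.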
3278   auto Unmerge = B.buildUnmerge(S16, Reg);
3279 
3280   SmallVector<Register, 4> WideRegs;
3281   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3282     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3283 
3284   int NumElts = StoreVT.getNumElements();
3285 
3286   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3287 }
3288 
3289 Register AMDGPULegalizerInfo::fixStoreSourceType(
3290   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3291   MachineRegisterInfo *MRI = B.getMRI();
3292   LLT Ty = MRI->getType(VData);
3293 
3294   const LLT S16 = LLT::scalar(16);
3295 
3296   // Fixup illegal register types for i8 stores.
3297   if (Ty == LLT::scalar(8) || Ty == S16) {
3298     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3299     return AnyExt;
3300   }
3301 
3302   if (Ty.isVector()) {
3303     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3304       if (IsFormat)
3305         return handleD16VData(B, *MRI, VData);
3306     }
3307   }
3308 
3309   return VData;
3310 }
3311 
3312 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3313                                               MachineRegisterInfo &MRI,
3314                                               MachineIRBuilder &B,
3315                                               bool IsTyped,
3316                                               bool IsFormat) const {
3317   Register VData = MI.getOperand(1).getReg();
3318   LLT Ty = MRI.getType(VData);
3319   LLT EltTy = Ty.getScalarType();
3320   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3321   const LLT S32 = LLT::scalar(32);
3322 
3323   VData = fixStoreSourceType(B, VData, IsFormat);
3324   Register RSrc = MI.getOperand(2).getReg();
3325 
3326   MachineMemOperand *MMO = *MI.memoperands_begin();
3327   const int MemSize = MMO->getSize();
3328 
3329   unsigned ImmOffset;
3330   unsigned TotalOffset;
3331 
3332   // The typed intrinsics add an immediate after the registers.
3333   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3334 
3335   // The struct intrinsic variants add one additional operand over raw.
3336   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3337   Register VIndex;
3338   int OpOffset = 0;
3339   if (HasVIndex) {
3340     VIndex = MI.getOperand(3).getReg();
3341     OpOffset = 1;
3342   }
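  // For reference, the operand layout at this point still includes the
  // intrinsic ID as operand 0: raw forms are (ID, vdata, rsrc, voffset,
  // soffset, aux), struct forms insert vindex after rsrc, and typed forms add
  // a format immediate before aux, which is where the 7/8 counts above come
  // from.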
3343 
3344   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3345   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3346 
3347   unsigned Format = 0;
3348   if (IsTyped) {
3349     Format = MI.getOperand(5 + OpOffset).getImm();
3350     ++OpOffset;
3351   }
3352 
3353   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3354 
3355   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3356   if (TotalOffset != 0)
3357     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3358 
3359   unsigned Opc;
3360   if (IsTyped) {
3361     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3362                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3363   } else if (IsFormat) {
3364     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3365                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3366   } else {
3367     switch (MemSize) {
3368     case 1:
3369       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3370       break;
3371     case 2:
3372       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3373       break;
3374     default:
3375       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3376       break;
3377     }
3378   }
3379 
3380   if (!VIndex)
3381     VIndex = B.buildConstant(S32, 0).getReg(0);
3382 
3383   auto MIB = B.buildInstr(Opc)
3384     .addUse(VData)              // vdata
3385     .addUse(RSrc)               // rsrc
3386     .addUse(VIndex)             // vindex
3387     .addUse(VOffset)            // voffset
3388     .addUse(SOffset)            // soffset
3389     .addImm(ImmOffset);         // offset(imm)
3390 
3391   if (IsTyped)
3392     MIB.addImm(Format);
3393 
3394   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3395      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3396      .addMemOperand(MMO);
3397 
3398   MI.eraseFromParent();
3399   return true;
3400 }
3401 
3402 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3403                                              MachineRegisterInfo &MRI,
3404                                              MachineIRBuilder &B,
3405                                              bool IsFormat,
3406                                              bool IsTyped) const {
3407   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3408   MachineMemOperand *MMO = *MI.memoperands_begin();
3409   const int MemSize = MMO->getSize();
3410   const LLT S32 = LLT::scalar(32);
3411 
3412   Register Dst = MI.getOperand(0).getReg();
3413   Register RSrc = MI.getOperand(2).getReg();
3414 
3415   // The typed intrinsics add an immediate after the registers.
3416   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3417 
3418   // The struct intrinsic variants add one additional operand over raw.
3419   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3420   Register VIndex;
3421   int OpOffset = 0;
3422   if (HasVIndex) {
3423     VIndex = MI.getOperand(3).getReg();
3424     OpOffset = 1;
3425   }
3426 
3427   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3428   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3429 
3430   unsigned Format = 0;
3431   if (IsTyped) {
3432     Format = MI.getOperand(5 + OpOffset).getImm();
3433     ++OpOffset;
3434   }
3435 
3436   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3437   unsigned ImmOffset;
3438   unsigned TotalOffset;
3439 
3440   LLT Ty = MRI.getType(Dst);
3441   LLT EltTy = Ty.getScalarType();
3442   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3443   const bool Unpacked = ST.hasUnpackedD16VMem();
3444 
3445   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3446   if (TotalOffset != 0)
3447     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3448 
3449   unsigned Opc;
3450 
3451   if (IsTyped) {
3452     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3453                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3454   } else if (IsFormat) {
3455     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3456                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3457   } else {
3458     switch (MemSize) {
3459     case 1:
3460       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3461       break;
3462     case 2:
3463       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3464       break;
3465     default:
3466       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3467       break;
3468     }
3469   }
3470 
3471   Register LoadDstReg;
3472 
3473   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3474   LLT UnpackedTy = Ty.changeElementSize(32);
3475 
3476   if (IsExtLoad)
3477     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3478   else if (Unpacked && IsD16 && Ty.isVector())
3479     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3480   else
3481     LoadDstReg = Dst;
3482 
3483   if (!VIndex)
3484     VIndex = B.buildConstant(S32, 0).getReg(0);
3485 
3486   auto MIB = B.buildInstr(Opc)
3487     .addDef(LoadDstReg)         // vdata
3488     .addUse(RSrc)               // rsrc
3489     .addUse(VIndex)             // vindex
3490     .addUse(VOffset)            // voffset
3491     .addUse(SOffset)            // soffset
3492     .addImm(ImmOffset);         // offset(imm)
3493 
3494   if (IsTyped)
3495     MIB.addImm(Format);
3496 
3497   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3498      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3499      .addMemOperand(MMO);
3500 
3501   if (LoadDstReg != Dst) {
3502     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3503 
3504     // The result for extending loads was widened; truncate back to the original type.
3505     if (IsExtLoad)
3506       B.buildTrunc(Dst, LoadDstReg);
3507     else {
3508       // Repack to original 16-bit vector result
3509       // FIXME: G_TRUNC should work, but legalization currently fails
3510       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3511       SmallVector<Register, 4> Repack;
3512       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3513         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3514       B.buildMerge(Dst, Repack);
3515     }
3516   }
3517 
3518   MI.eraseFromParent();
3519   return true;
3520 }
3521 
3522 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3523                                                MachineIRBuilder &B,
3524                                                bool IsInc) const {
3525   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3526                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3527   B.buildInstr(Opc)
3528     .addDef(MI.getOperand(0).getReg())
3529     .addUse(MI.getOperand(2).getReg())
3530     .addUse(MI.getOperand(3).getReg())
3531     .cloneMemRefs(MI);
3532   MI.eraseFromParent();
3533   return true;
3534 }
3535 
3536 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3537   switch (IntrID) {
3538   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3539   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3540     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3541   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3542   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3543     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3544   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3545   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3546     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3547   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3548   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3549     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3550   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3551   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3552     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3553   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3554   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3555     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3556   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3557   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3558     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3559   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3560   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3561     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3562   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3563   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3564     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3565   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3566   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3567     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3568   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3569   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3570     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3571   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3572   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3573     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3574   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3575   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3576     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3577   default:
3578     llvm_unreachable("unhandled atomic opcode");
3579   }
3580 }
3581 
3582 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3583                                                MachineIRBuilder &B,
3584                                                Intrinsic::ID IID) const {
3585   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3586                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3587 
3588   Register Dst = MI.getOperand(0).getReg();
3589   Register VData = MI.getOperand(2).getReg();
3590 
3591   Register CmpVal;
3592   int OpOffset = 0;
3593 
3594   if (IsCmpSwap) {
3595     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3596     ++OpOffset;
3597   }
3598 
3599   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3600   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3601 
3602   // The struct intrinsic variants add one additional operand over raw.
3603   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3604   Register VIndex;
3605   if (HasVIndex) {
3606     VIndex = MI.getOperand(4 + OpOffset).getReg();
3607     ++OpOffset;
3608   }
3609 
3610   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3611   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3612   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3613 
3614   MachineMemOperand *MMO = *MI.memoperands_begin();
3615 
3616   unsigned ImmOffset;
3617   unsigned TotalOffset;
3618   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3619   if (TotalOffset != 0)
3620     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3621 
3622   if (!VIndex)
3623     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3624 
3625   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3626     .addDef(Dst)
3627     .addUse(VData); // vdata
3628 
3629   if (IsCmpSwap)
3630     MIB.addReg(CmpVal);
3631 
3632   MIB.addUse(RSrc)               // rsrc
3633      .addUse(VIndex)             // vindex
3634      .addUse(VOffset)            // voffset
3635      .addUse(SOffset)            // soffset
3636      .addImm(ImmOffset)          // offset(imm)
3637      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3638      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3639      .addMemOperand(MMO);
3640 
3641   MI.eraseFromParent();
3642   return true;
3643 }
3644 
3645 /// Pack the s16 image address operands of \p MI into dword sized registers
3646 /// (two s16 elements each), appending the results to \p PackedAddrs.
3647 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3648                                         SmallVectorImpl<Register> &PackedAddrs,
3649                                         int AddrIdx, int DimIdx, int EndIdx,
3650                                         int NumGradients) {
3651   const LLT S16 = LLT::scalar(16);
3652   const LLT V2S16 = LLT::vector(2, 16);
3653 
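  // For example, three s16 coordinates (x, y, z) become <2 x s16>(x, y)
  // followed by <2 x s16>(z, undef); gradients are paired the same way, with
  // odd leftovers padded by undef as described below.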
3654   for (int I = AddrIdx; I < EndIdx; ++I) {
3655     MachineOperand &SrcOp = MI.getOperand(I);
3656     if (!SrcOp.isReg())
3657       continue; // _L to _LZ may have eliminated this.
3658 
3659     Register AddrReg = SrcOp.getReg();
3660 
3661     if (I < DimIdx) {
3662       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3663       PackedAddrs.push_back(AddrReg);
3664     } else {
3665       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3666       // derivatives dx/dh and dx/dv are packed with undef.
3667       if (((I + 1) >= EndIdx) ||
3668           ((NumGradients / 2) % 2 == 1 &&
3669            (I == DimIdx + (NumGradients / 2) - 1 ||
3670             I == DimIdx + NumGradients - 1)) ||
3671           // Check for _L to _LZ optimization
3672           !MI.getOperand(I + 1).isReg()) {
3673         PackedAddrs.push_back(
3674             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3675                 .getReg(0));
3676       } else {
3677         PackedAddrs.push_back(
3678             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3679                 .getReg(0));
3680         ++I;
3681       }
3682     }
3683   }
3684 }
3685 
3686 /// Convert from separate vaddr components to a single vector address register,
3687 /// and replace the remaining operands with $noreg.
3688 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3689                                      int DimIdx, int NumVAddrs) {
3690   const LLT S32 = LLT::scalar(32);
3691 
3692   SmallVector<Register, 8> AddrRegs;
3693   for (int I = 0; I != NumVAddrs; ++I) {
3694     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3695     if (SrcOp.isReg()) {
3696       AddrRegs.push_back(SrcOp.getReg());
3697       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3698     }
3699   }
3700 
3701   int NumAddrRegs = AddrRegs.size();
3702   if (NumAddrRegs != 1) {
3703     // Round up to 8 elements for v5-v7
3704     // FIXME: Missing intermediate sized register classes and instructions.
3705     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3706       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3707       auto Undef = B.buildUndef(S32);
3708       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3709       NumAddrRegs = RoundedNumRegs;
3710     }
3711 
3712     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3713     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3714   }
3715 
3716   for (int I = 1; I != NumVAddrs; ++I) {
3717     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3718     if (SrcOp.isReg())
3719       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3720   }
3721 }
3722 
3723 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3724 ///
3725 /// Depending on the subtarget, load/store with 16-bit element data need to be
3726 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3727 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3728 /// registers.
3729 ///
3730 /// We don't want to directly select image instructions just yet, but also want
3731 /// to expose all register repacking to the legalizer/combiners. We also don't
3732 /// want a selected instruction entering RegBankSelect. In order to avoid
3733 /// defining a multitude of intermediate image instructions, directly hack on
3734 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3735 /// padding now-unnecessary arguments with $noreg.
3736 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3737     MachineInstr &MI, MachineIRBuilder &B,
3738     GISelChangeObserver &Observer,
3739     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3740 
3741   const int NumDefs = MI.getNumExplicitDefs();
3742   bool IsTFE = NumDefs == 2;
3743   // We are only processing the operands of d16 image operations on subtargets
3744   // that use the unpacked register layout, or need to repack the TFE result.
3745 
3746   // TODO: Do we need to guard against already legalized intrinsics?
3747   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3748     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3749 
3750   MachineRegisterInfo *MRI = B.getMRI();
3751   const LLT S32 = LLT::scalar(32);
3752   const LLT S16 = LLT::scalar(16);
3753   const LLT V2S16 = LLT::vector(2, 16);
3754 
3755   // Index of first address argument
3756   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3757 
3758   int NumVAddrs, NumGradients;
3759   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3760   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3761     getDMaskIdx(BaseOpcode, NumDefs);
3762   unsigned DMask = 0;
3763 
3764   // Check for 16-bit addresses/gradients, and pack them if so.
3765   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3766   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3767   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3768   const bool IsG16 = GradTy == S16;
3769   const bool IsA16 = AddrTy == S16;
3770 
3771   int DMaskLanes = 0;
3772   if (!BaseOpcode->Atomic) {
3773     DMask = MI.getOperand(DMaskIdx).getImm();
3774     if (BaseOpcode->Gather4) {
3775       DMaskLanes = 4;
3776     } else if (DMask != 0) {
3777       DMaskLanes = countPopulation(DMask);
3778     } else if (!IsTFE && !BaseOpcode->Store) {
3779       // If dmask is 0, this is a no-op load. This can be eliminated.
3780       B.buildUndef(MI.getOperand(0));
3781       MI.eraseFromParent();
3782       return true;
3783     }
3784   }
3785 
3786   Observer.changingInstr(MI);
3787   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3788 
3789   unsigned NewOpcode = NumDefs == 0 ?
3790     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3791 
3792   // Track that we legalized this
3793   MI.setDesc(B.getTII().get(NewOpcode));
3794 
3795   // We expect to get an error flag since TFC is on and dmask is 0. Force
3796   // dmask to be at least 1, otherwise the instruction will fail.
3797   if (IsTFE && DMask == 0) {
3798     DMask = 0x1;
3799     DMaskLanes = 1;
3800     MI.getOperand(DMaskIdx).setImm(DMask);
3801   }
3802 
3803   if (BaseOpcode->Atomic) {
3804     Register VData0 = MI.getOperand(2).getReg();
3805     LLT Ty = MRI->getType(VData0);
3806 
3807     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3808     if (Ty.isVector())
3809       return false;
3810 
3811     if (BaseOpcode->AtomicX2) {
3812       Register VData1 = MI.getOperand(3).getReg();
3813       // The two values are packed in one register.
3814       LLT PackedTy = LLT::vector(2, Ty);
3815       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3816       MI.getOperand(2).setReg(Concat.getReg(0));
3817       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3818     }
3819   }
3820 
3821   int CorrectedNumVAddrs = NumVAddrs;
3822 
3823   // Optimize _L to _LZ when _L is zero
3824   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3825         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3826     const ConstantFP *ConstantLod;
3827     const int LodIdx = AddrIdx + NumVAddrs - 1;
3828 
3829     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3830       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3831         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3832         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3833           LZMappingInfo->LZ, ImageDimIntr->Dim);
3834 
3835         // The starting indexes should remain in the same place.
3836         --NumVAddrs;
3837         --CorrectedNumVAddrs;
3838 
3839         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3840           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3841         MI.RemoveOperand(LodIdx);
3842       }
3843     }
3844   }
3845 
3846   // Optimize _mip away, when 'lod' is zero
3847   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3848     int64_t ConstantLod;
3849     const int LodIdx = AddrIdx + NumVAddrs - 1;
3850 
3851     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3852       if (ConstantLod == 0) {
3853         // TODO: Change intrinsic opcode and remove the operand instead of replacing
3854         // it with 0, as the _L to _LZ handling is done above.
3855         MI.getOperand(LodIdx).ChangeToImmediate(0);
3856         --CorrectedNumVAddrs;
3857       }
3858     }
3859   }
3860 
3861   // Rewrite the addressing register layout before doing anything else.
3862   if (IsA16 || IsG16) {
3863     if (IsA16) {
3864       // Target must support the feature and gradients need to be 16 bit too
3865       if (!ST.hasA16() || !IsG16)
3866         return false;
3867     } else if (!ST.hasG16())
3868       return false;
3869 
3870     if (NumVAddrs > 1) {
3871       SmallVector<Register, 4> PackedRegs;
3872       // Don't compress addresses for G16
3873       const int PackEndIdx =
3874           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3875       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3876                                   PackEndIdx, NumGradients);
3877 
3878       if (!IsA16) {
3879         // Add uncompressed address
3880         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3881           int AddrReg = MI.getOperand(I).getReg();
3882           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3883           PackedRegs.push_back(AddrReg);
3884         }
3885       }
3886 
3887       // See also below in the non-a16 branch
3888       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3889 
3890       if (!UseNSA && PackedRegs.size() > 1) {
3891         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3892         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3893         PackedRegs[0] = Concat.getReg(0);
3894         PackedRegs.resize(1);
3895       }
3896 
3897       const int NumPacked = PackedRegs.size();
3898       for (int I = 0; I != NumVAddrs; ++I) {
3899         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3900         if (!SrcOp.isReg()) {
3901           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3902           continue;
3903         }
3904 
3905         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3906 
3907         if (I < NumPacked)
3908           SrcOp.setReg(PackedRegs[I]);
3909         else
3910           SrcOp.setReg(AMDGPU::NoRegister);
3911       }
3912     }
3913   } else {
3914     // If the register allocator cannot place the address registers contiguously
3915     // without introducing moves, then using the non-sequential address encoding
3916     // is always preferable, since it saves VALU instructions and is usually a
3917     // wash in terms of code size or even better.
3918     //
3919     // However, we currently have no way of hinting to the register allocator
3920     // that MIMG addresses should be placed contiguously when it is possible to
3921     // do so, so force non-NSA for the common 2-address case as a heuristic.
3922     //
3923     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3924     // allocation when possible.
3925     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3926 
3927     if (!UseNSA && NumVAddrs > 1)
3928       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3929   }
3930 
3931   int Flags = 0;
3932   if (IsA16)
3933     Flags |= 1;
3934   if (IsG16)
3935     Flags |= 2;
3936   MI.addOperand(MachineOperand::CreateImm(Flags));
3937 
3938   if (BaseOpcode->Store) { // No TFE for stores?
3939     // TODO: Handle dmask trim
3940     Register VData = MI.getOperand(1).getReg();
3941     LLT Ty = MRI->getType(VData);
3942     if (!Ty.isVector() || Ty.getElementType() != S16)
3943       return true;
3944 
3945     Register RepackedReg = handleD16VData(B, *MRI, VData);
3946     if (RepackedReg != VData) {
3947       MI.getOperand(1).setReg(RepackedReg);
3948     }
3949 
3950     return true;
3951   }
3952 
3953   Register DstReg = MI.getOperand(0).getReg();
3954   LLT Ty = MRI->getType(DstReg);
3955   const LLT EltTy = Ty.getScalarType();
3956   const bool IsD16 = Ty.getScalarType() == S16;
3957   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3958 
3959   // Confirm that the return type is large enough for the dmask specified
3960   if (NumElts < DMaskLanes)
3961     return false;
3962 
3963   if (NumElts > 4 || DMaskLanes > 4)
3964     return false;
3965 
3966   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3967   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3968 
3969   // The raw dword aligned data component of the load. The only legal cases
3970   // where this matters should be when using the packed D16 format, for
3971   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3972   LLT RoundedTy;
3973 
3974   // S32 vector to cover all data, plus the TFE result element.
3975   LLT TFETy;
3976 
3977   // Register type to use for each loaded component. Will be S32 or V2S16.
3978   LLT RegTy;
3979 
3980   if (IsD16 && ST.hasUnpackedD16VMem()) {
3981     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3982     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3983     RegTy = S32;
3984   } else {
3985     unsigned EltSize = EltTy.getSizeInBits();
3986     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3987     unsigned RoundedSize = 32 * RoundedElts;
3988     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3989     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3990     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3991   }
3992 
3993   // The return type does not need adjustment.
3994   // TODO: Should we change s16 case to s32 or <2 x s16>?
3995   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3996     return true;
3997 
3998   Register Dst1Reg;
3999 
4000   // Insert after the instruction.
4001   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4002 
4003   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4004   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4005   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4006   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4007 
4008   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4009 
4010   MI.getOperand(0).setReg(NewResultReg);
4011 
4012   // In the IR, TFE is supposed to be used with a 2 element struct return
4013   // type. The instruction really returns these two values in one contiguous
4014   // register, with one additional dword beyond the loaded data. Rewrite the
4015   // return type to use a single register result.
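  // For example, with four dmask lanes and TFE the load produces a single
  // <5 x s32> value, which is then unmerged into the data registers and the
  // s32 status dword.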
4016 
4017   if (IsTFE) {
4018     Dst1Reg = MI.getOperand(1).getReg();
4019     if (MRI->getType(Dst1Reg) != S32)
4020       return false;
4021 
4022     // TODO: Make sure the TFE operand bit is set.
4023     MI.RemoveOperand(1);
4024 
4025     // Handle the easy case that requires no repack instructions.
4026     if (Ty == S32) {
4027       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4028       return true;
4029     }
4030   }
4031 
4032   // Now figure out how to copy the new result register back into the old
4033   // result.
4034   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4035 
4036   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4037 
4038   if (ResultNumRegs == 1) {
4039     assert(!IsTFE);
4040     ResultRegs[0] = NewResultReg;
4041   } else {
4042     // We have to repack into a new vector of some kind.
4043     for (int I = 0; I != NumDataRegs; ++I)
4044       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4045     B.buildUnmerge(ResultRegs, NewResultReg);
4046 
4047     // Drop the final TFE element to get the data part. The TFE result is
4048     // directly written to the right place already.
4049     if (IsTFE)
4050       ResultRegs.resize(NumDataRegs);
4051   }
4052 
4053   // For an s16 scalar result, we form an s32 result with a truncate regardless
4054   // of packed vs. unpacked.
4055   if (IsD16 && !Ty.isVector()) {
4056     B.buildTrunc(DstReg, ResultRegs[0]);
4057     return true;
4058   }
4059 
4060   // Avoid a build/concat_vector of 1 entry.
4061   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4062     B.buildBitcast(DstReg, ResultRegs[0]);
4063     return true;
4064   }
4065 
4066   assert(Ty.isVector());
4067 
4068   if (IsD16) {
4069     // For packed D16 results with TFE enabled, all the data components are
4070     // S32. Cast back to the expected type.
4071     //
4072     // TODO: We don't really need to load s32 elements. We would only need one
4073     // cast for the TFE result if a multiple of v2s16 was used.
4074     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4075       for (Register &Reg : ResultRegs)
4076         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4077     } else if (ST.hasUnpackedD16VMem()) {
4078       for (Register &Reg : ResultRegs)
4079         Reg = B.buildTrunc(S16, Reg).getReg(0);
4080     }
4081   }
4082 
4083   auto padWithUndef = [&](LLT Ty, int NumElts) {
4084     if (NumElts == 0)
4085       return;
4086     Register Undef = B.buildUndef(Ty).getReg(0);
4087     for (int I = 0; I != NumElts; ++I)
4088       ResultRegs.push_back(Undef);
4089   };
4090 
4091   // Pad out any elements eliminated due to the dmask.
4092   LLT ResTy = MRI->getType(ResultRegs[0]);
4093   if (!ResTy.isVector()) {
4094     padWithUndef(ResTy, NumElts - ResultRegs.size());
4095     B.buildBuildVector(DstReg, ResultRegs);
4096     return true;
4097   }
4098 
4099   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4100   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4101 
4102   // Deal with the one annoying legal case.
4103   const LLT V3S16 = LLT::vector(3, 16);
4104   if (Ty == V3S16) {
4105     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4106     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4107     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4108     return true;
4109   }
4110 
4111   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4112   B.buildConcatVectors(DstReg, ResultRegs);
4113   return true;
4114 }
4115 
4116 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4117   MachineInstr &MI, MachineIRBuilder &B,
4118   GISelChangeObserver &Observer) const {
4119   Register Dst = MI.getOperand(0).getReg();
4120   LLT Ty = B.getMRI()->getType(Dst);
4121   unsigned Size = Ty.getSizeInBits();
4122   MachineFunction &MF = B.getMF();
4123 
4124   Observer.changingInstr(MI);
4125 
4126   // FIXME: We don't really need this intermediate instruction. The intrinsic
4127   // should be fixed to have a memory operand. Since it's readnone, we're not
4128   // allowed to add one.
4129   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4130   MI.RemoveOperand(1); // Remove intrinsic ID
4131 
4132   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4133   // TODO: Should this use datalayout alignment?
4134   const unsigned MemSize = (Size + 7) / 8;
4135   const Align MemAlign(4);
4136   MachineMemOperand *MMO = MF.getMachineMemOperand(
4137       MachinePointerInfo(),
4138       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4139           MachineMemOperand::MOInvariant,
4140       MemSize, MemAlign);
4141   MI.addMemOperand(MF, MMO);
4142 
4143   // There are no 96-bit result scalar loads, but widening to 128-bit should
4144   // always be legal. We may need to restore this to a 96-bit result if it turns
4145   // out this needs to be converted to a vector load during RegBankSelect.
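  // For example, an s96 result is widened to s128 and a <3 x s32> result to
  // <4 x s32>.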
4146   if (!isPowerOf2_32(Size)) {
4147     LegalizerHelper Helper(MF, *this, Observer, B);
4148 
4149     if (Ty.isVector())
4150       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4151     else
4152       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4153   }
4154 
4155   Observer.changedInstr(MI);
4156   return true;
4157 }
4158 
4159 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4160                                                 MachineRegisterInfo &MRI,
4161                                                 MachineIRBuilder &B) const {
4162   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4163   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4164       !ST.isTrapHandlerEnabled()) {
4165     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4166   } else {
4167     // Pass queue pointer to trap handler as input, and insert trap instruction
4168     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4169     const ArgDescriptor *Arg =
4170         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4171     if (!Arg)
4172       return false;
4173     MachineRegisterInfo &MRI = *B.getMRI();
4174     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4175     Register LiveIn = getLiveInRegister(
4176         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4177         /*InsertLiveInCopy=*/false);
4178     if (!loadInputValue(LiveIn, B, Arg))
4179       return false;
4180     B.buildCopy(SGPR01, LiveIn);
4181     B.buildInstr(AMDGPU::S_TRAP)
4182         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4183         .addReg(SGPR01, RegState::Implicit);
4184   }
4185 
4186   MI.eraseFromParent();
4187   return true;
4188 }
4189 
4190 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4191     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4192   // If this is a non-HSA path or the trap handler is disabled, report a
4193   // warning accordingly.
4194   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4195       !ST.isTrapHandlerEnabled()) {
4196     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4197                                      "debugtrap handler not supported",
4198                                      MI.getDebugLoc(), DS_Warning);
4199     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4200     Ctx.diagnose(NoTrap);
4201   } else {
4202     // Insert debug-trap instruction
4203     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4204   }
4205 
4206   MI.eraseFromParent();
4207   return true;
4208 }
4209 
4210 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4211                                             MachineInstr &MI) const {
4212   MachineIRBuilder &B = Helper.MIRBuilder;
4213   MachineRegisterInfo &MRI = *B.getMRI();
4214 
4215   // Replace the G_BRCOND use with the exec-manipulating and branch pseudos.
4216   auto IntrID = MI.getIntrinsicID();
4217   switch (IntrID) {
4218   case Intrinsic::amdgcn_if:
4219   case Intrinsic::amdgcn_else: {
4220     MachineInstr *Br = nullptr;
4221     MachineBasicBlock *UncondBrTarget = nullptr;
4222     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4223       const SIRegisterInfo *TRI
4224         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4225 
4226       Register Def = MI.getOperand(1).getReg();
4227       Register Use = MI.getOperand(3).getReg();
4228 
4229       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4230       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4231       if (IntrID == Intrinsic::amdgcn_if) {
4232         B.buildInstr(AMDGPU::SI_IF)
4233           .addDef(Def)
4234           .addUse(Use)
4235           .addMBB(UncondBrTarget);
4236       } else {
4237         B.buildInstr(AMDGPU::SI_ELSE)
4238           .addDef(Def)
4239           .addUse(Use)
4240           .addMBB(UncondBrTarget)
4241           .addImm(0);
4242       }
4243 
4244       if (Br) {
4245         Br->getOperand(0).setMBB(CondBrTarget);
4246       } else {
4247         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4248         // since we're swapping branch targets it needs to be reinserted.
4249         // FIXME: IRTranslator should probably not do this
4250         B.buildBr(*CondBrTarget);
4251       }
4252 
4253       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4254       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4255       MI.eraseFromParent();
4256       BrCond->eraseFromParent();
4257       return true;
4258     }
4259 
4260     return false;
4261   }
4262   case Intrinsic::amdgcn_loop: {
4263     MachineInstr *Br = nullptr;
4264     MachineBasicBlock *UncondBrTarget = nullptr;
4265     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4266       const SIRegisterInfo *TRI
4267         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4268 
4269       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4270       Register Reg = MI.getOperand(2).getReg();
4271 
4272       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4273       B.buildInstr(AMDGPU::SI_LOOP)
4274         .addUse(Reg)
4275         .addMBB(UncondBrTarget);
4276 
4277       if (Br)
4278         Br->getOperand(0).setMBB(CondBrTarget);
4279       else
4280         B.buildBr(*CondBrTarget);
4281 
4282       MI.eraseFromParent();
4283       BrCond->eraseFromParent();
4284       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4285       return true;
4286     }
4287 
4288     return false;
4289   }
4290   case Intrinsic::amdgcn_kernarg_segment_ptr:
4291     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4292       // This only makes sense to call in a kernel, so just lower to null.
4293       B.buildConstant(MI.getOperand(0).getReg(), 0);
4294       MI.eraseFromParent();
4295       return true;
4296     }
4297 
4298     return legalizePreloadedArgIntrin(
4299       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
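  // is.shared/is.private reduce to an address-space test (LOCAL vs. PRIVATE)
  // on the flat pointer.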
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
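    // The wavefront size is a subtarget constant, so this simply folds to a
    // G_CONSTANT.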
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
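  // The raw/struct buffer store and load families each share one legalizer;
  // the boolean arguments distinguish the plain, *_format and tbuffer (typed)
  // variants.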
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
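  // Every raw/struct buffer atomic funnels into a single helper; the
  // intrinsic ID is forwarded so it can select the matching operation.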
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
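  // atomic.inc/dec share a helper; the flag selects increment vs. decrement.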
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
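    // Image dimension intrinsics are identified by a per-intrinsic
    // descriptor; anything without one needs no custom legalization here.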
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}