1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
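// The widest register tuples are 1024 bits (32 x 32-bit registers).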
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of vector elements up to the next power of two.
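// For example, <3 x s16> becomes <4 x s16>.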
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the scalar size in bits up to the next power of two.
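// For example, s48 becomes s64.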
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
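// Odd-element vectors of sub-32-bit elements whose total size is not a
// multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.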
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
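// Vectors of 16-bit elements with more than two elements, i.e. anything wider
// than v2s16.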
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
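// Reduce the element count so the resulting type is at most 64 bits wide,
// e.g. <4 x s32> becomes <2 x s32>.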
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
100 // Increase the number of vector elements so the total size reaches the next
101 // multiple of 32 bits.
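// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits).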
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
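      // <6 x s16> -> <3 x s32>
      // <8 x s8>  -> <2 x s32>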
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
182 // Any combination of 32 or 64-bit elements up to the maximum register size, and
183 // multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
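// Vectors whose element type is either s16 or at least 32 bits wide.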
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
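// Scalar values wider than 32 bits that are truncated when stored to memory.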
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < RegSize)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
299 // work around this. Eventually it should ignore the type for loads and only
300 // care about the size. Return true in cases where we will work around this
301 // for now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
444   // FIXME: Not really legal. Placeholder for custom lowering.
445   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446     .customFor({S32, S64})
447     .clampScalar(0, S32, S64)
448     .widenScalarToNextPow2(0, 32)
449     .scalarize(0);
450 
451   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
452     .legalFor({S32})
453     .clampScalar(0, S32, S32)
454     .scalarize(0);
455 
456   // Report legal for any types we can handle anywhere. For the cases only legal
457   // on the SALU, RegBankSelect will be able to re-legalize.
458   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
459     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
460     .clampScalar(0, S32, S64)
461     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
462     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
463     .widenScalarToNextPow2(0)
464     .scalarize(0);
465 
466   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
467                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468     .legalFor({{S32, S1}, {S32, S32}})
469     .minScalar(0, S32)
470     // TODO: .scalarize(0)
471     .lower();
472 
473   getActionDefinitionsBuilder(G_BITCAST)
474     // Don't worry about the size constraint.
475     .legalIf(all(isRegisterType(0), isRegisterType(1)))
476     .lower();
477 
478 
479   getActionDefinitionsBuilder(G_CONSTANT)
480     .legalFor({S1, S32, S64, S16, GlobalPtr,
481                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
482     .clampScalar(0, S32, S64)
483     .widenScalarToNextPow2(0)
484     .legalIf(isPointer(0));
485 
486   getActionDefinitionsBuilder(G_FCONSTANT)
487     .legalFor({S32, S64, S16})
488     .clampScalar(0, S16, S64);
489 
490   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491       .legalIf(isRegisterType(0))
492       // s1 and s16 are special cases because they have legal operations on
493       // them, but don't really occupy registers in the normal way.
494       .legalFor({S1, S16})
495       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496       .clampScalarOrElt(0, S32, MaxScalar)
497       .widenScalarToNextPow2(0, 32)
498       .clampMaxNumElements(0, S32, 16);
499 
500   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501 
502   // If the amount is divergent, we have to do a wave reduction to get the
503   // maximum value, so this is expanded during RegBankSelect.
504   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505     .legalFor({{PrivatePtr, S32}});
506 
507   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508     .unsupportedFor({PrivatePtr})
509     .custom();
510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511 
512   auto &FPOpActions = getActionDefinitionsBuilder(
513     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
514     .legalFor({S32, S64});
515   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
516     .customFor({S32, S64});
517   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
518     .customFor({S32, S64});
519 
520   if (ST.has16BitInsts()) {
521     if (ST.hasVOP3PInsts())
522       FPOpActions.legalFor({S16, V2S16});
523     else
524       FPOpActions.legalFor({S16});
525 
526     TrigActions.customFor({S16});
527     FDIVActions.customFor({S16});
528   }
529 
530   auto &MinNumMaxNum = getActionDefinitionsBuilder({
531       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
532 
533   if (ST.hasVOP3PInsts()) {
534     MinNumMaxNum.customFor(FPTypesPK16)
535       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
536       .clampMaxNumElements(0, S16, 2)
537       .clampScalar(0, S16, S64)
538       .scalarize(0);
539   } else if (ST.has16BitInsts()) {
540     MinNumMaxNum.customFor(FPTypes16)
541       .clampScalar(0, S16, S64)
542       .scalarize(0);
543   } else {
544     MinNumMaxNum.customFor(FPTypesBase)
545       .clampScalar(0, S32, S64)
546       .scalarize(0);
547   }
548 
549   if (ST.hasVOP3PInsts())
550     FPOpActions.clampMaxNumElements(0, S16, 2);
551 
552   FPOpActions
553     .scalarize(0)
554     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
555 
556   TrigActions
557     .scalarize(0)
558     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
559 
560   FDIVActions
561     .scalarize(0)
562     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
563 
564   getActionDefinitionsBuilder({G_FNEG, G_FABS})
565     .legalFor(FPTypesPK16)
566     .clampMaxNumElements(0, S16, 2)
567     .scalarize(0)
568     .clampScalar(0, S16, S64);
569 
570   if (ST.has16BitInsts()) {
571     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
572       .legalFor({S32, S64, S16})
573       .scalarize(0)
574       .clampScalar(0, S16, S64);
575   } else {
576     getActionDefinitionsBuilder(G_FSQRT)
577       .legalFor({S32, S64})
578       .scalarize(0)
579       .clampScalar(0, S32, S64);
580 
581     if (ST.hasFractBug()) {
582       getActionDefinitionsBuilder(G_FFLOOR)
583         .customFor({S64})
584         .legalFor({S32, S64})
585         .scalarize(0)
586         .clampScalar(0, S32, S64);
587     } else {
588       getActionDefinitionsBuilder(G_FFLOOR)
589         .legalFor({S32, S64})
590         .scalarize(0)
591         .clampScalar(0, S32, S64);
592     }
593   }
594 
595   getActionDefinitionsBuilder(G_FPTRUNC)
596     .legalFor({{S32, S64}, {S16, S32}})
597     .scalarize(0)
598     .lower();
599 
600   getActionDefinitionsBuilder(G_FPEXT)
601     .legalFor({{S64, S32}, {S32, S16}})
602     .lowerFor({{S64, S16}}) // FIXME: Implement
603     .scalarize(0);
604 
605   getActionDefinitionsBuilder(G_FSUB)
606       // Use actual fsub instruction
607       .legalFor({S32})
608       // Must use fadd + fneg
609       .lowerFor({S64, S16, V2S16})
610       .scalarize(0)
611       .clampScalar(0, S32, S64);
612 
613   // Whether this is legal depends on the floating point mode for the function.
614   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
616     FMad.customFor({S32, S16});
617   else if (ST.hasMadMacF32Insts())
618     FMad.customFor({S32});
619   else if (ST.hasMadF16())
620     FMad.customFor({S16});
621   FMad.scalarize(0)
622       .lower();
623 
624   // TODO: Do we need to clamp maximum bitwidth?
625   getActionDefinitionsBuilder(G_TRUNC)
626     .legalIf(isScalar(0))
627     .legalFor({{V2S16, V2S32}})
628     .clampMaxNumElements(0, S16, 2)
629     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
630     // situations (like an invalid implicit use), we don't want to loop forever
631     // in the legalizer.
632     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
633     .alwaysLegal();
634 
635   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
636     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
637                {S32, S1}, {S64, S1}, {S16, S1}})
638     .scalarize(0)
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(1, 32);
641 
642   // TODO: Split s1->s64 during regbankselect for VALU.
643   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
644     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
645     .lowerFor({{S32, S64}})
646     .lowerIf(typeIs(1, S1))
647     .customFor({{S64, S64}});
648   if (ST.has16BitInsts())
649     IToFP.legalFor({{S16, S16}});
650   IToFP.clampScalar(1, S32, S64)
651        .scalarize(0)
652        .widenScalarToNextPow2(1);
653 
654   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656     .customFor({{S64, S64}});
657   if (ST.has16BitInsts())
658     FPToI.legalFor({{S16, S16}});
659   else
660     FPToI.minScalar(1, S32);
661 
662   FPToI.minScalar(0, S32)
663        .scalarize(0)
664        .lower();
665 
666   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
667     .scalarize(0)
668     .lower();
669 
670   if (ST.has16BitInsts()) {
671     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
672       .legalFor({S16, S32, S64})
673       .clampScalar(0, S16, S64)
674       .scalarize(0);
675   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
676     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
677       .legalFor({S32, S64})
678       .clampScalar(0, S32, S64)
679       .scalarize(0);
680   } else {
681     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
682       .legalFor({S32})
683       .customFor({S64})
684       .clampScalar(0, S32, S64)
685       .scalarize(0);
686   }
687 
688   // FIXME: Clamp offset operand.
689   getActionDefinitionsBuilder(G_PTR_ADD)
690     .legalIf(isPointer(0))
691     .scalarize(0);
692 
693   getActionDefinitionsBuilder(G_PTRMASK)
694     .legalIf(typeInSet(1, {S64, S32}))
695     .minScalar(1, S32)
696     .maxScalarIf(sizeIs(0, 32), 1, S32)
697     .maxScalarIf(sizeIs(0, 64), 1, S64)
698     .scalarize(0);
699 
700   auto &CmpBuilder =
701     getActionDefinitionsBuilder(G_ICMP)
702     // The compare output type differs based on the register bank of the output,
703     // so make both s1 and s32 legal.
704     //
705     // Scalar compares producing output in scc will be promoted to s32, as that
706     // is the allocatable register type that will be needed for the copy from
707     // scc. This will be promoted during RegBankSelect, and we assume something
708     // before that won't try to use s32 result types.
709     //
710     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
711     // bank.
712     .legalForCartesianProduct(
713       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
714     .legalForCartesianProduct(
715       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
716   if (ST.has16BitInsts()) {
717     CmpBuilder.legalFor({{S1, S16}});
718   }
719 
720   CmpBuilder
721     .widenScalarToNextPow2(1)
722     .clampScalar(1, S32, S64)
723     .scalarize(0)
724     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
725 
726   getActionDefinitionsBuilder(G_FCMP)
727     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
728     .widenScalarToNextPow2(1)
729     .clampScalar(1, S32, S64)
730     .scalarize(0);
731 
732   // FIXME: fpow has a selection pattern that should move to custom lowering.
733   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
734   if (ST.has16BitInsts())
735     Exp2Ops.legalFor({S32, S16});
736   else
737     Exp2Ops.legalFor({S32});
738   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
739   Exp2Ops.scalarize(0);
740 
741   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
742   if (ST.has16BitInsts())
743     ExpOps.customFor({{S32}, {S16}});
744   else
745     ExpOps.customFor({S32});
746   ExpOps.clampScalar(0, MinScalarFPTy, S32)
747         .scalarize(0);
748 
749   // The 64-bit versions produce 32-bit results, but only on the SALU.
750   getActionDefinitionsBuilder(G_CTPOP)
751     .legalFor({{S32, S32}, {S32, S64}})
752     .clampScalar(0, S32, S32)
753     .clampScalar(1, S32, S64)
754     .scalarize(0)
755     .widenScalarToNextPow2(0, 32)
756     .widenScalarToNextPow2(1, 32);
757 
758   // The hardware instructions return a different result on 0 than the generic
759   // instructions expect. The hardware produces -1, but these produce the
760   // bitwidth.
761   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
762     .scalarize(0)
763     .clampScalar(0, S32, S32)
764     .clampScalar(1, S32, S64)
765     .widenScalarToNextPow2(0, 32)
766     .widenScalarToNextPow2(1, 32)
767     .lower();
768 
769   // The 64-bit versions produce 32-bit results, but only on the SALU.
770   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
771     .legalFor({{S32, S32}, {S32, S64}})
772     .clampScalar(0, S32, S32)
773     .clampScalar(1, S32, S64)
774     .scalarize(0)
775     .widenScalarToNextPow2(0, 32)
776     .widenScalarToNextPow2(1, 32);
777 
778   getActionDefinitionsBuilder(G_BITREVERSE)
779     .legalFor({S32})
780     .clampScalar(0, S32, S32)
781     .scalarize(0);
782 
783   if (ST.has16BitInsts()) {
784     getActionDefinitionsBuilder(G_BSWAP)
785       .legalFor({S16, S32, V2S16})
786       .clampMaxNumElements(0, S16, 2)
787       // FIXME: Fixing non-power-of-2 before clamp is a workaround for
788       // narrowScalar limitation.
789       .widenScalarToNextPow2(0)
790       .clampScalar(0, S16, S32)
791       .scalarize(0);
792 
793     if (ST.hasVOP3PInsts()) {
794       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
795         .legalFor({S32, S16, V2S16})
796         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
797         .clampMaxNumElements(0, S16, 2)
798         .minScalar(0, S16)
799         .widenScalarToNextPow2(0)
800         .scalarize(0)
801         .lower();
802     } else {
803       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
804         .legalFor({S32, S16})
805         .widenScalarToNextPow2(0)
806         .minScalar(0, S16)
807         .scalarize(0)
808         .lower();
809     }
810   } else {
811     // TODO: Should have same legality without v_perm_b32
812     getActionDefinitionsBuilder(G_BSWAP)
813       .legalFor({S32})
814       .lowerIf(scalarNarrowerThan(0, 32))
815       // FIXME: Fixing non-power-of-2 before clamp is a workaround for
816       // narrowScalar limitation.
817       .widenScalarToNextPow2(0)
818       .maxScalar(0, S32)
819       .scalarize(0)
820       .lower();
821 
822     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
823       .legalFor({S32})
824       .minScalar(0, S32)
825       .widenScalarToNextPow2(0)
826       .scalarize(0)
827       .lower();
828   }
829 
830   getActionDefinitionsBuilder(G_INTTOPTR)
831     // List the common cases
832     .legalForCartesianProduct(AddrSpaces64, {S64})
833     .legalForCartesianProduct(AddrSpaces32, {S32})
834     .scalarize(0)
835     // Accept any address space as long as the size matches
836     .legalIf(sameSize(0, 1))
837     .widenScalarIf(smallerThan(1, 0),
838       [](const LegalityQuery &Query) {
839         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
840       })
841     .narrowScalarIf(largerThan(1, 0),
842       [](const LegalityQuery &Query) {
843         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
844       });
845 
846   getActionDefinitionsBuilder(G_PTRTOINT)
847     // List the common cases
848     .legalForCartesianProduct(AddrSpaces64, {S64})
849     .legalForCartesianProduct(AddrSpaces32, {S32})
850     .scalarize(0)
851     // Accept any address space as long as the size matches
852     .legalIf(sameSize(0, 1))
853     .widenScalarIf(smallerThan(0, 1),
854       [](const LegalityQuery &Query) {
855         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
856       })
857     .narrowScalarIf(
858       largerThan(0, 1),
859       [](const LegalityQuery &Query) {
860         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
861       });
862 
863   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
864     .scalarize(0)
865     .custom();
866 
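  // Whether a memory access must be split: vector extloads, accesses wider
  // than the address space allows, register counts the hardware cannot issue
  // in one operation, or misaligned accesses the subtarget does not support.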
867   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
868                                     bool IsLoad) -> bool {
869     const LLT DstTy = Query.Types[0];
870 
871     // Split vector extloads.
872     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
873     unsigned Align = Query.MMODescrs[0].AlignInBits;
874 
875     if (MemSize < DstTy.getSizeInBits())
876       MemSize = std::max(MemSize, Align);
877 
878     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
879       return true;
880 
881     const LLT PtrTy = Query.Types[1];
882     unsigned AS = PtrTy.getAddressSpace();
883     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
884       return true;
885 
886     // Catch weird sized loads that don't evenly divide into the access sizes
887     // TODO: May be able to widen depending on alignment etc.
888     unsigned NumRegs = (MemSize + 31) / 32;
889     if (NumRegs == 3) {
890       if (!ST.hasDwordx3LoadStores())
891         return true;
892     } else {
893       // If the alignment allows, these should have been widened.
894       if (!isPowerOf2_32(NumRegs))
895         return true;
896     }
897 
898     if (Align < MemSize) {
899       const SITargetLowering *TLI = ST.getTargetLowering();
900       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
901     }
902 
903     return false;
904   };
905 
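  // Whether a load with a non-power-of-2 result size should be widened to the
  // next power of two: the size must be below the address space limit and the
  // known alignment must cover the rounded-up size.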
906   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
907                                          unsigned Opc) -> bool {
908     unsigned Size = Query.Types[0].getSizeInBits();
909     if (isPowerOf2_32(Size))
910       return false;
911 
912     if (Size == 96 && ST.hasDwordx3LoadStores())
913       return false;
914 
915     unsigned AddrSpace = Query.Types[1].getAddressSpace();
916     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
917       return false;
918 
919     unsigned Align = Query.MMODescrs[0].AlignInBits;
920     unsigned RoundedSize = NextPowerOf2(Size);
921     return (Align >= RoundedSize);
922   };
923 
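  // Only require natural alignment for global/constant accesses when the
  // subtarget does not support unaligned buffer access.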
924   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
925   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
926   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
927 
928   // TODO: Refine based on subtargets which support unaligned access or 128-bit
929   // LDS
930   // TODO: Unsupported flat for SI.
931 
932   for (unsigned Op : {G_LOAD, G_STORE}) {
933     const bool IsStore = Op == G_STORE;
934 
935     auto &Actions = getActionDefinitionsBuilder(Op);
936     // Explicitly list some common cases.
937     // TODO: Does this help compile time at all?
938     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
939                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
940                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
941                                       {S64, GlobalPtr, 64, GlobalAlign32},
942                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
943                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
944                                       {S32, GlobalPtr, 8, GlobalAlign8},
945                                       {S32, GlobalPtr, 16, GlobalAlign16},
946 
947                                       {S32, LocalPtr, 32, 32},
948                                       {S64, LocalPtr, 64, 32},
949                                       {V2S32, LocalPtr, 64, 32},
950                                       {S32, LocalPtr, 8, 8},
951                                       {S32, LocalPtr, 16, 16},
952                                       {V2S16, LocalPtr, 32, 32},
953 
954                                       {S32, PrivatePtr, 32, 32},
955                                       {S32, PrivatePtr, 8, 8},
956                                       {S32, PrivatePtr, 16, 16},
957                                       {V2S16, PrivatePtr, 32, 32},
958 
959                                       {S32, ConstantPtr, 32, GlobalAlign32},
960                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
961                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
962                                       {S64, ConstantPtr, 64, GlobalAlign32},
963                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
964     Actions.legalIf(
965       [=](const LegalityQuery &Query) -> bool {
966         return isLoadStoreLegal(ST, Query, Op);
967       });
968 
969     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
970     // 64 bits.
971     //
972     // TODO: Should generalize bitcast action into coerce, which will also cover
973     // inserting addrspacecasts.
974     Actions.customIf(typeIs(1, Constant32Ptr));
975 
976     // Turn any illegal element vectors into something easier to deal
977     // with. These will ultimately produce 32-bit scalar shifts to extract the
978     // parts anyway.
979     //
980     // For odd 16-bit element vectors, prefer to split those into pieces with
981     // 16-bit vector parts.
982     Actions.bitcastIf(
983       [=](const LegalityQuery &Query) -> bool {
984         const LLT Ty = Query.Types[0];
985         const unsigned Size = Ty.getSizeInBits();
986 
987         if (Size != Query.MMODescrs[0].SizeInBits)
988           return Size <= 32 && Ty.isVector();
989 
990         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
991           return true;
992         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
993                !isRegisterVectorElementType(Ty.getElementType());
994       }, bitcastToRegisterType(0));
995 
996     Actions
997         .customIf(typeIs(1, Constant32Ptr))
998         // Widen suitably aligned loads by loading extra elements.
999         .moreElementsIf([=](const LegalityQuery &Query) {
1000             const LLT Ty = Query.Types[0];
1001             return Op == G_LOAD && Ty.isVector() &&
1002                    shouldWidenLoadResult(Query, Op);
1003           }, moreElementsToNextPow2(0))
1004         .widenScalarIf([=](const LegalityQuery &Query) {
1005             const LLT Ty = Query.Types[0];
1006             return Op == G_LOAD && !Ty.isVector() &&
1007                    shouldWidenLoadResult(Query, Op);
1008           }, widenScalarOrEltToNextPow2(0))
1009         .narrowScalarIf(
1010             [=](const LegalityQuery &Query) -> bool {
1011               return !Query.Types[0].isVector() &&
1012                      needToSplitMemOp(Query, Op == G_LOAD);
1013             },
1014             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1015               const LLT DstTy = Query.Types[0];
1016               const LLT PtrTy = Query.Types[1];
1017 
1018               const unsigned DstSize = DstTy.getSizeInBits();
1019               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020 
1021               // Split extloads.
1022               if (DstSize > MemSize)
1023                 return std::make_pair(0, LLT::scalar(MemSize));
1024 
1025               if (!isPowerOf2_32(DstSize)) {
1026                 // We're probably decomposing an odd sized store. Try to split
1027                 // to the widest type. TODO: Account for alignment. As-is it
1028                 // should be OK, since the new parts will be further legalized.
1029                 unsigned FloorSize = PowerOf2Floor(DstSize);
1030                 return std::make_pair(0, LLT::scalar(FloorSize));
1031               }
1032 
1033               if (DstSize > 32 && (DstSize % 32 != 0)) {
1034                 // FIXME: Need a way to specify non-extload of larger size if
1035                 // suitably aligned.
1036                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1037               }
1038 
1039               unsigned MaxSize = maxSizeForAddrSpace(ST,
1040                                                      PtrTy.getAddressSpace(),
1041                                                      Op == G_LOAD);
1042               if (MemSize > MaxSize)
1043                 return std::make_pair(0, LLT::scalar(MaxSize));
1044 
1045               unsigned Align = Query.MMODescrs[0].AlignInBits;
1046               return std::make_pair(0, LLT::scalar(Align));
1047             })
1048         .fewerElementsIf(
1049             [=](const LegalityQuery &Query) -> bool {
1050               return Query.Types[0].isVector() &&
1051                      needToSplitMemOp(Query, Op == G_LOAD);
1052             },
1053             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1054               const LLT DstTy = Query.Types[0];
1055               const LLT PtrTy = Query.Types[1];
1056 
1057               LLT EltTy = DstTy.getElementType();
1058               unsigned MaxSize = maxSizeForAddrSpace(ST,
1059                                                      PtrTy.getAddressSpace(),
1060                                                      Op == G_LOAD);
1061 
1062               // FIXME: Handle widened to power of 2 results better. This ends
1063               // up scalarizing.
1064               // FIXME: 3 element stores scalarized on SI
1065 
1066               // Split if it's too large for the address space.
1067               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1068                 unsigned NumElts = DstTy.getNumElements();
1069                 unsigned EltSize = EltTy.getSizeInBits();
1070 
1071                 if (MaxSize % EltSize == 0) {
1072                   return std::make_pair(
1073                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1074                 }
1075 
1076                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1077 
1078                 // FIXME: Refine when odd breakdowns handled
1079                 // The scalars will need to be re-legalized.
1080                 if (NumPieces == 1 || NumPieces >= NumElts ||
1081                     NumElts % NumPieces != 0)
1082                   return std::make_pair(0, EltTy);
1083 
1084                 return std::make_pair(0,
1085                                       LLT::vector(NumElts / NumPieces, EltTy));
1086               }
1087 
1088               // FIXME: We could probably handle weird extending loads better.
1089               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1090               if (DstTy.getSizeInBits() > MemSize)
1091                 return std::make_pair(0, EltTy);
1092 
1093               unsigned EltSize = EltTy.getSizeInBits();
1094               unsigned DstSize = DstTy.getSizeInBits();
1095               if (!isPowerOf2_32(DstSize)) {
1096                 // We're probably decomposing an odd sized store. Try to split
1097                 // to the widest type. TODO: Account for alignment. As-is it
1098                 // should be OK, since the new parts will be further legalized.
1099                 unsigned FloorSize = PowerOf2Floor(DstSize);
1100                 return std::make_pair(
1101                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1102               }
1103 
1104               // Need to split because of alignment.
1105               unsigned Align = Query.MMODescrs[0].AlignInBits;
1106               if (EltSize > Align &&
1107                   (EltSize / Align < DstTy.getNumElements())) {
1108                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1109               }
1110 
1111               // May need relegalization for the scalars.
1112               return std::make_pair(0, EltTy);
1113             })
1114         .minScalar(0, S32);
1115 
1116     if (IsStore)
1117       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1118 
1119     // TODO: Need a bitcast lower option?
1120     Actions
1121         .widenScalarToNextPow2(0)
1122         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1123   }
1124 
1125   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1126                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1127                                                   {S32, GlobalPtr, 16, 2 * 8},
1128                                                   {S32, LocalPtr, 8, 8},
1129                                                   {S32, LocalPtr, 16, 16},
1130                                                   {S32, PrivatePtr, 8, 8},
1131                                                   {S32, PrivatePtr, 16, 16},
1132                                                   {S32, ConstantPtr, 8, 8},
1133                                                   {S32, ConstantPtr, 16, 2 * 8}});
1134   if (ST.hasFlatAddressSpace()) {
1135     ExtLoads.legalForTypesWithMemDesc(
1136         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1137   }
1138 
1139   ExtLoads.clampScalar(0, S32, S32)
1140           .widenScalarToNextPow2(0)
1141           .unsupportedIfMemSizeNotPow2()
1142           .lower();
1143 
1144   auto &Atomics = getActionDefinitionsBuilder(
1145     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1146      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1147      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1148      G_ATOMICRMW_UMIN})
1149     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1150                {S64, GlobalPtr}, {S64, LocalPtr}});
1151   if (ST.hasFlatAddressSpace()) {
1152     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1153   }
1154 
1155   if (ST.hasLDSFPAtomics()) {
1156     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1157       .legalFor({{S32, LocalPtr}});
1158   }
1159 
1160   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1161   // demarshalling.
1162   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1163     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1164                 {S32, FlatPtr}, {S64, FlatPtr}})
1165     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1166                {S32, RegionPtr}, {S64, RegionPtr}});
1167   // TODO: Pointer types, any 32-bit or 64-bit vector
1168 
1169   // Condition should be s32 for scalar, s1 for vector.
1170   getActionDefinitionsBuilder(G_SELECT)
1171     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1172           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1173           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1174     .clampScalar(0, S16, S64)
1175     .scalarize(1)
1176     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1177     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1178     .clampMaxNumElements(0, S32, 2)
1179     .clampMaxNumElements(0, LocalPtr, 2)
1180     .clampMaxNumElements(0, PrivatePtr, 2)
1181     .scalarize(0)
1182     .widenScalarToNextPow2(0)
1183     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1184 
1185   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1186   // be more flexible with the shift amount type.
1187   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1188     .legalFor({{S32, S32}, {S64, S32}});
1189   if (ST.has16BitInsts()) {
1190     if (ST.hasVOP3PInsts()) {
1191       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1192             .clampMaxNumElements(0, S16, 2);
1193     } else
1194       Shifts.legalFor({{S16, S16}});
1195 
1196     // TODO: Support 16-bit shift amounts for all types
1197     Shifts.widenScalarIf(
1198       [=](const LegalityQuery &Query) {
1199         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1200         // 32-bit amount.
1201         const LLT ValTy = Query.Types[0];
1202         const LLT AmountTy = Query.Types[1];
1203         return ValTy.getSizeInBits() <= 16 &&
1204                AmountTy.getSizeInBits() < 16;
1205       }, changeTo(1, S16));
1206     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1207     Shifts.clampScalar(1, S32, S32);
1208     Shifts.clampScalar(0, S16, S64);
1209     Shifts.widenScalarToNextPow2(0, 16);
1210   } else {
1211     // Make sure we legalize the shift amount type first, as the general
1212     // expansion for the shifted type will produce much worse code if it hasn't
1213     // been truncated already.
1214     Shifts.clampScalar(1, S32, S32);
1215     Shifts.clampScalar(0, S32, S64);
1216     Shifts.widenScalarToNextPow2(0, 32);
1217   }
1218   Shifts.scalarize(0);
1219 
1220   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1221     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1222     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1223     unsigned IdxTypeIdx = 2;
1224 
1225     getActionDefinitionsBuilder(Op)
1226       .customIf([=](const LegalityQuery &Query) {
1227           const LLT EltTy = Query.Types[EltTypeIdx];
1228           const LLT VecTy = Query.Types[VecTypeIdx];
1229           const LLT IdxTy = Query.Types[IdxTypeIdx];
1230           return (EltTy.getSizeInBits() == 16 ||
1231                   EltTy.getSizeInBits() % 32 == 0) &&
1232                  VecTy.getSizeInBits() % 32 == 0 &&
1233                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1234                  IdxTy.getSizeInBits() == 32;
1235         })
1236       .clampScalar(EltTypeIdx, S32, S64)
1237       .clampScalar(VecTypeIdx, S32, S64)
1238       .clampScalar(IdxTypeIdx, S32, S32);
1239   }
1240 
1241   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1242     .unsupportedIf([=](const LegalityQuery &Query) {
1243         const LLT &EltTy = Query.Types[1].getElementType();
1244         return Query.Types[0] != EltTy;
1245       });
1246 
1247   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1248     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1249     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1250 
1251     // FIXME: Doesn't handle extract of illegal sizes.
1252     getActionDefinitionsBuilder(Op)
1253       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1254       // FIXME: Multiples of 16 should not be legal.
1255       .legalIf([=](const LegalityQuery &Query) {
1256           const LLT BigTy = Query.Types[BigTyIdx];
1257           const LLT LitTy = Query.Types[LitTyIdx];
1258           return (BigTy.getSizeInBits() % 32 == 0) &&
1259                  (LitTy.getSizeInBits() % 16 == 0);
1260         })
1261       .widenScalarIf(
1262         [=](const LegalityQuery &Query) {
1263           const LLT BigTy = Query.Types[BigTyIdx];
1264           return (BigTy.getScalarSizeInBits() < 16);
1265         },
1266         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1267       .widenScalarIf(
1268         [=](const LegalityQuery &Query) {
1269           const LLT LitTy = Query.Types[LitTyIdx];
1270           return (LitTy.getScalarSizeInBits() < 16);
1271         },
1272         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1273       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1274       .widenScalarToNextPow2(BigTyIdx, 32);
1275 
1276   }
1277 
1278   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1279     .legalForCartesianProduct(AllS32Vectors, {S32})
1280     .legalForCartesianProduct(AllS64Vectors, {S64})
1281     .clampNumElements(0, V16S32, V32S32)
1282     .clampNumElements(0, V2S64, V16S64)
1283     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1284 
1285   if (ST.hasScalarPackInsts()) {
1286     BuildVector
1287       // FIXME: Should probably widen s1 vectors straight to s32
1288       .minScalarOrElt(0, S16)
1289       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1290       .minScalar(1, S32);
1291 
1292     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1293       .legalFor({V2S16, S32})
1294       .lower();
1295     BuildVector.minScalarOrElt(0, S32);
1296   } else {
1297     BuildVector.customFor({V2S16, S16});
1298     BuildVector.minScalarOrElt(0, S32);
1299 
1300     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1301       .customFor({V2S16, S32})
1302       .lower();
1303   }
1304 
1305   BuildVector.legalIf(isRegisterType(0));
1306 
1307   // FIXME: Clamp maximum size
1308   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1309     .legalIf(isRegisterType(0));
1310 
1311   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1312   // pre-legalize.
1313   if (ST.hasVOP3PInsts()) {
1314     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1315       .customFor({V2S16, V2S16})
1316       .lower();
1317   } else
1318     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1319 
1320   // Merge/Unmerge
1321   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1322     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1323     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1324 
1325     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1326       const LLT Ty = Query.Types[TypeIdx];
1327       if (Ty.isVector()) {
1328         const LLT &EltTy = Ty.getElementType();
1329         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1330           return true;
1331         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1332           return true;
1333       }
1334       return false;
1335     };
1336 
1337     auto &Builder = getActionDefinitionsBuilder(Op)
1338       .lowerFor({{S16, V2S16}})
1339       .lowerIf([=](const LegalityQuery &Query) {
1340           const LLT BigTy = Query.Types[BigTyIdx];
1341           return BigTy.getSizeInBits() == 32;
1342         })
1343       // Try to widen to s16 first for small types.
1344       // TODO: Only do this on targets with legal s16 shifts
1345       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1346       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1347       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1348       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1349                            elementTypeIs(1, S16)),
1350                        changeTo(1, V2S16))
1351       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1352       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1353       // valid.
1354       .clampScalar(LitTyIdx, S32, S512)
1355       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1356       // Break up vectors with weird elements into scalars
1357       .fewerElementsIf(
1358         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1359         scalarize(0))
1360       .fewerElementsIf(
1361         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1362         scalarize(1))
1363       .clampScalar(BigTyIdx, S32, MaxScalar);
1364 
1365     if (Op == G_MERGE_VALUES) {
1366       Builder.widenScalarIf(
1367         // TODO: Use 16-bit shifts if legal for 8-bit values?
1368         [=](const LegalityQuery &Query) {
1369           const LLT Ty = Query.Types[LitTyIdx];
1370           return Ty.getSizeInBits() < 32;
1371         },
1372         changeTo(LitTyIdx, S32));
1373     }
1374 
1375     Builder.widenScalarIf(
1376       [=](const LegalityQuery &Query) {
1377         const LLT Ty = Query.Types[BigTyIdx];
1378         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1379           Ty.getSizeInBits() % 16 != 0;
1380       },
1381       [=](const LegalityQuery &Query) {
1382         // Pick the next power of 2, or a multiple of 64 over 128, whichever is
1383         // smaller.
1384         const LLT &Ty = Query.Types[BigTyIdx];
1385         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1386         if (NewSizeInBits >= 256) {
1387           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1388           if (RoundedTo < NewSizeInBits)
1389             NewSizeInBits = RoundedTo;
1390         }
1391         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1392       })
1393       .legalIf([=](const LegalityQuery &Query) {
1394           const LLT &BigTy = Query.Types[BigTyIdx];
1395           const LLT &LitTy = Query.Types[LitTyIdx];
1396 
1397           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1398             return false;
1399           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1400             return false;
1401 
1402           return BigTy.getSizeInBits() % 16 == 0 &&
1403                  LitTy.getSizeInBits() % 16 == 0 &&
1404                  BigTy.getSizeInBits() <= MaxRegisterSize;
1405         })
1406       // Any vectors left are the wrong size. Scalarize them.
1407       .scalarize(0)
1408       .scalarize(1);
1409   }
1410 
1411   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1412   // RegBankSelect.
1413   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1414     .legalFor({{S32}, {S64}});
1415 
1416   if (ST.hasVOP3PInsts()) {
1417     SextInReg.lowerFor({{V2S16}})
1418       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1419       // get more vector shift opportunities, since we'll get those when
1420       // expanded.
1421       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1422   } else if (ST.has16BitInsts()) {
1423     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1424   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1427     SextInReg.lowerFor({{S32}, {S64}});
1428   }
1429 
1430   SextInReg
1431     .scalarize(0)
1432     .clampScalar(0, S32, S64)
1433     .lower();
1434 
1435   getActionDefinitionsBuilder(G_FSHR)
1436     .legalFor({{S32, S32}})
1437     .scalarize(0)
1438     .lower();
1439 
1440   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1441     .legalFor({S64});
1442 
1443   getActionDefinitionsBuilder({
1444       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1445       G_FCOPYSIGN,
1446 
1447       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1448       G_READ_REGISTER,
1449       G_WRITE_REGISTER,
1450 
1451       G_SADDO, G_SSUBO,
1452 
      // TODO: Implement
1454       G_FMINIMUM, G_FMAXIMUM,
1455       G_FSHL
1456     }).lower();
1457 
1458   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1459         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1460         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1461     .unsupported();
1462 
1463   computeTables();
1464   verify(*ST.getInstrInfo());
1465 }
1466 
1467 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1468                                          MachineInstr &MI) const {
1469   MachineIRBuilder &B = Helper.MIRBuilder;
1470   MachineRegisterInfo &MRI = *B.getMRI();
1471   GISelChangeObserver &Observer = Helper.Observer;
1472 
1473   switch (MI.getOpcode()) {
1474   case TargetOpcode::G_ADDRSPACE_CAST:
1475     return legalizeAddrSpaceCast(MI, MRI, B);
1476   case TargetOpcode::G_FRINT:
1477     return legalizeFrint(MI, MRI, B);
1478   case TargetOpcode::G_FCEIL:
1479     return legalizeFceil(MI, MRI, B);
1480   case TargetOpcode::G_INTRINSIC_TRUNC:
1481     return legalizeIntrinsicTrunc(MI, MRI, B);
1482   case TargetOpcode::G_SITOFP:
1483     return legalizeITOFP(MI, MRI, B, true);
1484   case TargetOpcode::G_UITOFP:
1485     return legalizeITOFP(MI, MRI, B, false);
1486   case TargetOpcode::G_FPTOSI:
1487     return legalizeFPTOI(MI, MRI, B, true);
1488   case TargetOpcode::G_FPTOUI:
1489     return legalizeFPTOI(MI, MRI, B, false);
1490   case TargetOpcode::G_FMINNUM:
1491   case TargetOpcode::G_FMAXNUM:
1492   case TargetOpcode::G_FMINNUM_IEEE:
1493   case TargetOpcode::G_FMAXNUM_IEEE:
1494     return legalizeMinNumMaxNum(Helper, MI);
1495   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1496     return legalizeExtractVectorElt(MI, MRI, B);
1497   case TargetOpcode::G_INSERT_VECTOR_ELT:
1498     return legalizeInsertVectorElt(MI, MRI, B);
1499   case TargetOpcode::G_SHUFFLE_VECTOR:
1500     return legalizeShuffleVector(MI, MRI, B);
1501   case TargetOpcode::G_FSIN:
1502   case TargetOpcode::G_FCOS:
1503     return legalizeSinCos(MI, MRI, B);
1504   case TargetOpcode::G_GLOBAL_VALUE:
1505     return legalizeGlobalValue(MI, MRI, B);
1506   case TargetOpcode::G_LOAD:
1507     return legalizeLoad(MI, MRI, B, Observer);
1508   case TargetOpcode::G_FMAD:
1509     return legalizeFMad(MI, MRI, B);
1510   case TargetOpcode::G_FDIV:
1511     return legalizeFDIV(MI, MRI, B);
1512   case TargetOpcode::G_UDIV:
1513   case TargetOpcode::G_UREM:
1514     return legalizeUDIV_UREM(MI, MRI, B);
1515   case TargetOpcode::G_SDIV:
1516   case TargetOpcode::G_SREM:
1517     return legalizeSDIV_SREM(MI, MRI, B);
1518   case TargetOpcode::G_ATOMIC_CMPXCHG:
1519     return legalizeAtomicCmpXChg(MI, MRI, B);
1520   case TargetOpcode::G_FLOG:
1521     return legalizeFlog(MI, B, numbers::ln2f);
1522   case TargetOpcode::G_FLOG10:
1523     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1524   case TargetOpcode::G_FEXP:
1525     return legalizeFExp(MI, B);
1526   case TargetOpcode::G_FPOW:
1527     return legalizeFPow(MI, B);
1528   case TargetOpcode::G_FFLOOR:
1529     return legalizeFFloor(MI, MRI, B);
1530   case TargetOpcode::G_BUILD_VECTOR:
1531     return legalizeBuildVector(MI, MRI, B);
1532   default:
1533     return false;
1534   }
1535 
1536   llvm_unreachable("expected switch to return");
1537 }
1538 
1539 Register AMDGPULegalizerInfo::getSegmentAperture(
1540   unsigned AS,
1541   MachineRegisterInfo &MRI,
1542   MachineIRBuilder &B) const {
1543   MachineFunction &MF = B.getMF();
1544   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1545   const LLT S32 = LLT::scalar(32);
1546 
1547   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1548 
1549   if (ST.hasApertureRegs()) {
1550     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1551     // getreg.
1552     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1553         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1554         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1555     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1556         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1557         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1558     unsigned Encoding =
1559         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1560         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1561         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1562 
1563     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1564 
1565     B.buildInstr(AMDGPU::S_GETREG_B32)
1566       .addDef(GetReg)
1567       .addImm(Encoding);
1568     MRI.setType(GetReg, S32);
1569 
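    // S_GETREG returns the aperture base as a 16-bit field; shifting it left
    // by the field width (WidthM1 + 1 == 16) reconstructs the high 32 bits of
    // the aperture address, which is what this function returns.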
1570     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1571     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1572   }
1573 
1574   Register QueuePtr = MRI.createGenericVirtualRegister(
1575     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1576 
1577   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1578   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1579     return Register();
1580 
1581   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1582   // private_segment_aperture_base_hi.
1583   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1584 
1585   // TODO: can we be smarter about machine pointer info?
1586   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1587   MachineMemOperand *MMO = MF.getMachineMemOperand(
1588       PtrInfo,
1589       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1590           MachineMemOperand::MOInvariant,
1591       4, commonAlignment(Align(64), StructOffset));
1592 
1593   Register LoadAddr;
1594 
1595   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1596   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1597 }
1598 
1599 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1600   MachineInstr &MI, MachineRegisterInfo &MRI,
1601   MachineIRBuilder &B) const {
1602   MachineFunction &MF = B.getMF();
1603 
1604   const LLT S32 = LLT::scalar(32);
1605   Register Dst = MI.getOperand(0).getReg();
1606   Register Src = MI.getOperand(1).getReg();
1607 
1608   LLT DstTy = MRI.getType(Dst);
1609   LLT SrcTy = MRI.getType(Src);
1610   unsigned DestAS = DstTy.getAddressSpace();
1611   unsigned SrcAS = SrcTy.getAddressSpace();
1612 
1613   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1614   // vector element.
1615   assert(!DstTy.isVector());
1616 
1617   const AMDGPUTargetMachine &TM
1618     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1619 
1620   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1621   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1622     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1623     return true;
1624   }
1625 
1626   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1627     // Truncate.
1628     B.buildExtract(Dst, Src, 0);
1629     MI.eraseFromParent();
1630     return true;
1631   }
1632 
1633   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1634     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1635     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1636 
1637     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1638     // another. Merge operands are required to be the same type, but creating an
1639     // extra ptrtoint would be kind of pointless.
1640     auto HighAddr = B.buildConstant(
1641       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1642     B.buildMerge(Dst, {Src, HighAddr});
1643     MI.eraseFromParent();
1644     return true;
1645   }
1646 
1647   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1648     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1649            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1650     unsigned NullVal = TM.getNullPointerValue(DestAS);
1651 
1652     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1653     auto FlatNull = B.buildConstant(SrcTy, 0);
1654 
1655     // Extract low 32-bits of the pointer.
1656     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1657 
1658     auto CmpRes =
1659         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1660     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1661 
1662     MI.eraseFromParent();
1663     return true;
1664   }
1665 
1666   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1667     return false;
1668 
1669   if (!ST.hasFlatAddressSpace())
1670     return false;
1671 
1672   auto SegmentNull =
1673       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1674   auto FlatNull =
1675       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1676 
1677   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1678   if (!ApertureReg.isValid())
1679     return false;
1680 
1681   auto CmpRes =
1682       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1683 
1684   // Coerce the type of the low half of the result so we can use merge_values.
1685   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1686 
1687   // TODO: Should we allow mismatched types but matching sizes in merges to
1688   // avoid the ptrtoint?
1689   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1690   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1691 
1692   MI.eraseFromParent();
1693   return true;
1694 }
1695 
1696 bool AMDGPULegalizerInfo::legalizeFrint(
1697   MachineInstr &MI, MachineRegisterInfo &MRI,
1698   MachineIRBuilder &B) const {
1699   Register Src = MI.getOperand(1).getReg();
1700   LLT Ty = MRI.getType(Src);
1701   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1702 
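  // Round to the nearest integer by adding and then subtracting 2^52 (with the
  // sign of Src), which forces rounding to an integer in double precision.
  // Values whose magnitude exceeds 0x1.fffffffffffffp+51 are already integral
  // and are passed through by the final select.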
1703   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1704   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1705 
1706   auto C1 = B.buildFConstant(Ty, C1Val);
1707   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1708 
1709   // TODO: Should this propagate fast-math-flags?
1710   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1711   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1712 
1713   auto C2 = B.buildFConstant(Ty, C2Val);
1714   auto Fabs = B.buildFAbs(Ty, Src);
1715 
1716   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1719 }
1720 
1721 bool AMDGPULegalizerInfo::legalizeFceil(
1722   MachineInstr &MI, MachineRegisterInfo &MRI,
1723   MachineIRBuilder &B) const {
1724 
1725   const LLT S1 = LLT::scalar(1);
1726   const LLT S64 = LLT::scalar(64);
1727 
1728   Register Src = MI.getOperand(1).getReg();
1729   assert(MRI.getType(Src) == S64);
1730 
1731   // result = trunc(src)
1732   // if (src > 0.0 && src != result)
1733   //   result += 1.0
1734 
1735   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1736 
1737   const auto Zero = B.buildFConstant(S64, 0.0);
1738   const auto One = B.buildFConstant(S64, 1.0);
1739   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1740   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1741   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1742   auto Add = B.buildSelect(S64, And, One, Zero);
1743 
1744   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1747 }
1748 
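// Extract the biased 11-bit exponent from the high 32 bits of an f64 value and
// subtract the exponent bias (1023).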
1749 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1750                                               MachineIRBuilder &B) {
1751   const unsigned FractBits = 52;
1752   const unsigned ExpBits = 11;
1753   LLT S32 = LLT::scalar(32);
1754 
1755   auto Const0 = B.buildConstant(S32, FractBits - 32);
1756   auto Const1 = B.buildConstant(S32, ExpBits);
1757 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1761 
1762   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1763 }
1764 
1765 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1766   MachineInstr &MI, MachineRegisterInfo &MRI,
1767   MachineIRBuilder &B) const {
1768   const LLT S1 = LLT::scalar(1);
1769   const LLT S32 = LLT::scalar(32);
1770   const LLT S64 = LLT::scalar(64);
1771 
1772   Register Src = MI.getOperand(1).getReg();
1773   assert(MRI.getType(Src) == S64);
1774 
1775   // TODO: Should this use extract since the low half is unused?
1776   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1777   Register Hi = Unmerge.getReg(1);
1778 
1779   // Extract the upper half, since this is where we will find the sign and
1780   // exponent.
1781   auto Exp = extractF64Exponent(Hi, B);
1782 
1783   const unsigned FractBits = 52;
1784 
1785   // Extract the sign bit.
1786   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1787   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1788 
1789   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1790 
1791   const auto Zero32 = B.buildConstant(S32, 0);
1792 
1793   // Extend back to 64-bits.
1794   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1795 
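  // Mask off the fraction bits that sit below the binary point for this
  // exponent: trunc(x) = x & ~(FractMask >> Exp). Exponents below 0 mean
  // |x| < 1, so only the sign bit survives; exponents above 51 mean x is
  // already an integer.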
1796   auto Shr = B.buildAShr(S64, FractMask, Exp);
1797   auto Not = B.buildNot(S64, Shr);
1798   auto Tmp0 = B.buildAnd(S64, Src, Not);
1799   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1800 
1801   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1802   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1803 
1804   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1807 }
1808 
1809 bool AMDGPULegalizerInfo::legalizeITOFP(
1810   MachineInstr &MI, MachineRegisterInfo &MRI,
1811   MachineIRBuilder &B, bool Signed) const {
1812 
1813   Register Dst = MI.getOperand(0).getReg();
1814   Register Src = MI.getOperand(1).getReg();
1815 
1816   const LLT S64 = LLT::scalar(64);
1817   const LLT S32 = LLT::scalar(32);
1818 
1819   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1820 
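  // Convert the two 32-bit halves separately and recombine as
  // cvt(Hi) * 2^32 + cvt(Lo), where only the high half is converted signed for
  // G_SITOFP and the scaling by 2^32 is done with ldexp.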
1821   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1822 
1823   auto CvtHi = Signed ?
1824     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1825     B.buildUITOFP(S64, Unmerge.getReg(1));
1826 
1827   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1828 
1829   auto ThirtyTwo = B.buildConstant(S32, 32);
1830   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1831     .addUse(CvtHi.getReg(0))
1832     .addUse(ThirtyTwo.getReg(0));
1833 
1834   // TODO: Should this propagate fast-math-flags?
1835   B.buildFAdd(Dst, LdExp, CvtLo);
1836   MI.eraseFromParent();
1837   return true;
1838 }
1839 
1840 // TODO: Copied from DAG implementation. Verify logic and document how this
1841 // actually works.
1842 bool AMDGPULegalizerInfo::legalizeFPTOI(
1843   MachineInstr &MI, MachineRegisterInfo &MRI,
1844   MachineIRBuilder &B, bool Signed) const {
1845 
1846   Register Dst = MI.getOperand(0).getReg();
1847   Register Src = MI.getOperand(1).getReg();
1848 
1849   const LLT S64 = LLT::scalar(64);
1850   const LLT S32 = LLT::scalar(32);
1851 
1852   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1853 
1854   unsigned Flags = MI.getFlags();
1855 
1856   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
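  // K0 = 2^-32 and K1 = -(2^32), given as raw IEEE-754 double bit patterns.
  // Hi = floor(trunc(x) * 2^-32) recovers the high 32-bit word, and the fma
  // computes the remaining low word as trunc(x) - Hi * 2^32.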
1857   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1858   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1859 
1860   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1861   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1862   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1863 
1864   auto Hi = Signed ?
1865     B.buildFPTOSI(S32, FloorMul) :
1866     B.buildFPTOUI(S32, FloorMul);
1867   auto Lo = B.buildFPTOUI(S32, Fma);
1868 
1869   B.buildMerge(Dst, { Lo, Hi });
1870   MI.eraseFromParent();
1871 
1872   return true;
1873 }
1874 
1875 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1876                                                MachineInstr &MI) const {
1877   MachineFunction &MF = Helper.MIRBuilder.getMF();
1878   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1879 
1880   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1881                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1882 
1883   // With ieee_mode disabled, the instructions have the correct behavior
1884   // already for G_FMINNUM/G_FMAXNUM
1885   if (!MFI->getMode().IEEE)
1886     return !IsIEEEOp;
1887 
1888   if (IsIEEEOp)
1889     return true;
1890 
1891   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1892 }
1893 
1894 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1895   MachineInstr &MI, MachineRegisterInfo &MRI,
1896   MachineIRBuilder &B) const {
1897   // TODO: Should move some of this into LegalizerHelper.
1898 
1899   // TODO: Promote dynamic indexing of s16 to s32
1900 
1901   // FIXME: Artifact combiner probably should have replaced the truncated
1902   // constant before this, so we shouldn't need
1903   // getConstantVRegValWithLookThrough.
1904   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1905     MI.getOperand(2).getReg(), MRI);
1906   if (!IdxVal) // Dynamic case will be selected to register indexing.
1907     return true;
1908 
1909   Register Dst = MI.getOperand(0).getReg();
1910   Register Vec = MI.getOperand(1).getReg();
1911 
1912   LLT VecTy = MRI.getType(Vec);
1913   LLT EltTy = VecTy.getElementType();
1914   assert(EltTy == MRI.getType(Dst));
1915 
1916   if (IdxVal->Value < VecTy.getNumElements())
1917     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1918   else
1919     B.buildUndef(Dst);
1920 
1921   MI.eraseFromParent();
1922   return true;
1923 }
1924 
1925 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   MachineIRBuilder &B) const {
1928   // TODO: Should move some of this into LegalizerHelper.
1929 
1930   // TODO: Promote dynamic indexing of s16 to s32
1931 
1932   // FIXME: Artifact combiner probably should have replaced the truncated
1933   // constant before this, so we shouldn't need
1934   // getConstantVRegValWithLookThrough.
1935   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1936     MI.getOperand(3).getReg(), MRI);
1937   if (!IdxVal) // Dynamic case will be selected to register indexing.
1938     return true;
1939 
1940   Register Dst = MI.getOperand(0).getReg();
1941   Register Vec = MI.getOperand(1).getReg();
1942   Register Ins = MI.getOperand(2).getReg();
1943 
1944   LLT VecTy = MRI.getType(Vec);
1945   LLT EltTy = VecTy.getElementType();
1946   assert(EltTy == MRI.getType(Ins));
1947 
1948   if (IdxVal->Value < VecTy.getNumElements())
1949     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1950   else
1951     B.buildUndef(Dst);
1952 
1953   MI.eraseFromParent();
1954   return true;
1955 }
1956 
1957 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1958   MachineInstr &MI, MachineRegisterInfo &MRI,
1959   MachineIRBuilder &B) const {
1960   const LLT V2S16 = LLT::vector(2, 16);
1961 
1962   Register Dst = MI.getOperand(0).getReg();
1963   Register Src0 = MI.getOperand(1).getReg();
1964   LLT DstTy = MRI.getType(Dst);
1965   LLT SrcTy = MRI.getType(Src0);
1966 
1967   if (SrcTy == V2S16 && DstTy == V2S16 &&
1968       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1969     return true;
1970 
1971   MachineIRBuilder HelperBuilder(MI);
1972   GISelObserverWrapper DummyObserver;
1973   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1974   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1975 }
1976 
1977 bool AMDGPULegalizerInfo::legalizeSinCos(
1978   MachineInstr &MI, MachineRegisterInfo &MRI,
1979   MachineIRBuilder &B) const {
1980 
1981   Register DstReg = MI.getOperand(0).getReg();
1982   Register SrcReg = MI.getOperand(1).getReg();
1983   LLT Ty = MRI.getType(DstReg);
1984   unsigned Flags = MI.getFlags();
1985 
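  // The hardware sin/cos intrinsics take an input scaled so that one full
  // period maps onto 1.0, so multiply by 1/(2*pi) first. On subtargets where
  // the instruction only accepts a reduced input range, also take the
  // fractional part of the scaled value.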
1986   Register TrigVal;
1987   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1988   if (ST.hasTrigReducedRange()) {
1989     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1990     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1991       .addUse(MulVal.getReg(0))
1992       .setMIFlags(Flags).getReg(0);
1993   } else
1994     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1995 
1996   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1997     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1998   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1999     .addUse(TrigVal)
2000     .setMIFlags(Flags);
2001   MI.eraseFromParent();
2002   return true;
2003 }
2004 
2005 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2006                                                   MachineIRBuilder &B,
2007                                                   const GlobalValue *GV,
2008                                                   int64_t Offset,
2009                                                   unsigned GAFlags) const {
2010   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2011   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2012   // to the following code sequence:
2013   //
2014   // For constant address space:
2015   //   s_getpc_b64 s[0:1]
2016   //   s_add_u32 s0, s0, $symbol
2017   //   s_addc_u32 s1, s1, 0
2018   //
2019   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2020   //   a fixup or relocation is emitted to replace $symbol with a literal
2021   //   constant, which is a pc-relative offset from the encoding of the $symbol
2022   //   operand to the global variable.
2023   //
2024   // For global address space:
2025   //   s_getpc_b64 s[0:1]
2026   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2027   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2028   //
2029   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2030   //   fixups or relocations are emitted to replace $symbol@*@lo and
2031   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2032   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2033   //   operand to the global variable.
2034   //
2035   // What we want here is an offset from the value returned by s_getpc
2036   // (which is the address of the s_add_u32 instruction) to the global
2037   // variable, but since the encoding of $symbol starts 4 bytes after the start
2038   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2039   // small. This requires us to add 4 to the global variable offset in order to
2040   // compute the correct address.
2041 
2042   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2043 
2044   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2045     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2046 
2047   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2048     .addDef(PCReg);
2049 
2050   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2051   if (GAFlags == SIInstrInfo::MO_NONE)
2052     MIB.addImm(0);
2053   else
2054     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2055 
2056   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2057 
2058   if (PtrTy.getSizeInBits() == 32)
2059     B.buildExtract(DstReg, PCReg, 0);
2060   return true;
}
2062 
2063 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2064   MachineInstr &MI, MachineRegisterInfo &MRI,
2065   MachineIRBuilder &B) const {
2066   Register DstReg = MI.getOperand(0).getReg();
2067   LLT Ty = MRI.getType(DstReg);
2068   unsigned AS = Ty.getAddressSpace();
2069 
2070   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2071   MachineFunction &MF = B.getMF();
2072   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2073 
2074   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2075     if (!MFI->isEntryFunction()) {
2076       const Function &Fn = MF.getFunction();
2077       DiagnosticInfoUnsupported BadLDSDecl(
2078         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2079         DS_Warning);
2080       Fn.getContext().diagnose(BadLDSDecl);
2081 
2082       // We currently don't have a way to correctly allocate LDS objects that
2083       // aren't directly associated with a kernel. We do force inlining of
2084       // functions that use local objects. However, if these dead functions are
2085       // not eliminated, we don't want a compile time error. Just emit a warning
2086       // and a trap, since there should be no callable path here.
2087       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2088       B.buildUndef(DstReg);
2089       MI.eraseFromParent();
2090       return true;
2091     }
2092 
2093     // TODO: We could emit code to handle the initialization somewhere.
2094     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2095       const SITargetLowering *TLI = ST.getTargetLowering();
2096       if (!TLI->shouldUseLDSConstAddress(GV)) {
2097         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2098         return true; // Leave in place;
2099       }
2100 
2101       B.buildConstant(
2102           DstReg,
2103           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2104       MI.eraseFromParent();
2105       return true;
2106     }
2107 
2108     const Function &Fn = MF.getFunction();
2109     DiagnosticInfoUnsupported BadInit(
2110       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2111     Fn.getContext().diagnose(BadInit);
2112     return true;
2113   }
2114 
2115   const SITargetLowering *TLI = ST.getTargetLowering();
2116 
2117   if (TLI->shouldEmitFixup(GV)) {
2118     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2119     MI.eraseFromParent();
2120     return true;
2121   }
2122 
2123   if (TLI->shouldEmitPCReloc(GV)) {
2124     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2125     MI.eraseFromParent();
2126     return true;
2127   }
2128 
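  // Otherwise the address must come from the GOT: build a pc-relative pointer
  // to the GOT entry and load the 64-bit global address from it, truncating
  // afterwards if the destination is a 32-bit pointer.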
2129   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2130   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2131 
2132   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2133       MachinePointerInfo::getGOT(MF),
2134       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2135           MachineMemOperand::MOInvariant,
2136       8 /*Size*/, Align(8));
2137 
2138   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2139 
2140   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2142     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2143     B.buildExtract(DstReg, Load, 0);
2144   } else
2145     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2146 
2147   MI.eraseFromParent();
2148   return true;
2149 }
2150 
2151 bool AMDGPULegalizerInfo::legalizeLoad(
2152   MachineInstr &MI, MachineRegisterInfo &MRI,
2153   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2154   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2155   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2156   Observer.changingInstr(MI);
2157   MI.getOperand(1).setReg(Cast.getReg(0));
2158   Observer.changedInstr(MI);
2159   return true;
2160 }
2161 
2162 bool AMDGPULegalizerInfo::legalizeFMad(
2163   MachineInstr &MI, MachineRegisterInfo &MRI,
2164   MachineIRBuilder &B) const {
2165   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2166   assert(Ty.isScalar());
2167 
2168   MachineFunction &MF = B.getMF();
2169   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2170 
2171   // TODO: Always legal with future ftz flag.
2172   // FIXME: Do we need just output?
2173   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2174     return true;
2175   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2176     return true;
2177 
2178   MachineIRBuilder HelperBuilder(MI);
2179   GISelObserverWrapper DummyObserver;
2180   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2181   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2182 }
2183 
2184 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2185   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2186   Register DstReg = MI.getOperand(0).getReg();
2187   Register PtrReg = MI.getOperand(1).getReg();
2188   Register CmpVal = MI.getOperand(2).getReg();
2189   Register NewVal = MI.getOperand(3).getReg();
2190 
2191   assert(SITargetLowering::isFlatGlobalAddrSpace(
2192            MRI.getType(PtrReg).getAddressSpace()) &&
2193          "this should not have been custom lowered");
2194 
2195   LLT ValTy = MRI.getType(CmpVal);
2196   LLT VecTy = LLT::vector(2, ValTy);
2197 
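  // The G_AMDGPU_ATOMIC_CMPXCHG pseudo takes the new value and the compare
  // value packed into adjacent elements, new value first, which appears to
  // mirror the data operand layout of the hardware cmpswap instructions.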
2198   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2199 
2200   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2201     .addDef(DstReg)
2202     .addUse(PtrReg)
2203     .addUse(PackedVal)
2204     .setMemRefs(MI.memoperands());
2205 
2206   MI.eraseFromParent();
2207   return true;
2208 }
2209 
2210 bool AMDGPULegalizerInfo::legalizeFlog(
2211   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2212   Register Dst = MI.getOperand(0).getReg();
2213   Register Src = MI.getOperand(1).getReg();
2214   LLT Ty = B.getMRI()->getType(Dst);
2215   unsigned Flags = MI.getFlags();
2216 
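  // log_b(x) = log2(x) * (1 / log2(b)); the callers pass Log2BaseInverted as
  // ln(2) for G_FLOG and ln(2)/ln(10) for G_FLOG10.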
2217   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2218   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2219 
2220   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2221   MI.eraseFromParent();
2222   return true;
2223 }
2224 
2225 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2226                                        MachineIRBuilder &B) const {
2227   Register Dst = MI.getOperand(0).getReg();
2228   Register Src = MI.getOperand(1).getReg();
2229   unsigned Flags = MI.getFlags();
2230   LLT Ty = B.getMRI()->getType(Dst);
2231 
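  // exp(x) = exp2(x * log2(e)).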
2232   auto K = B.buildFConstant(Ty, numbers::log2e);
2233   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2234   B.buildFExp2(Dst, Mul, Flags);
2235   MI.eraseFromParent();
2236   return true;
2237 }
2238 
2239 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2240                                        MachineIRBuilder &B) const {
2241   Register Dst = MI.getOperand(0).getReg();
2242   Register Src0 = MI.getOperand(1).getReg();
2243   Register Src1 = MI.getOperand(2).getReg();
2244   unsigned Flags = MI.getFlags();
2245   LLT Ty = B.getMRI()->getType(Dst);
2246   const LLT S16 = LLT::scalar(16);
2247   const LLT S32 = LLT::scalar(32);
2248 
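  // Expand as pow(x, y) = exp2(y * log2(x)). The legacy multiply flushes
  // 0 * +/-inf to 0, which keeps pow(x, 0) == 1 even when log2(x) is infinite.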
2249   if (Ty == S32) {
2250     auto Log = B.buildFLog2(S32, Src0, Flags);
2251     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2252       .addUse(Log.getReg(0))
2253       .addUse(Src1)
2254       .setMIFlags(Flags);
2255     B.buildFExp2(Dst, Mul, Flags);
2256   } else if (Ty == S16) {
2257     // There's no f16 fmul_legacy, so we need to convert for it.
2258     auto Log = B.buildFLog2(S16, Src0, Flags);
2259     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2260     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2261     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2262       .addUse(Ext0.getReg(0))
2263       .addUse(Ext1.getReg(0))
2264       .setMIFlags(Flags);
2265 
2266     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2267   } else
2268     return false;
2269 
2270   MI.eraseFromParent();
2271   return true;
2272 }
2273 
2274 // Find a source register, ignoring any possible source modifiers.
2275 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2276   Register ModSrc = OrigSrc;
2277   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2278     ModSrc = SrcFNeg->getOperand(1).getReg();
2279     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2280       ModSrc = SrcFAbs->getOperand(1).getReg();
2281   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2282     ModSrc = SrcFAbs->getOperand(1).getReg();
2283   return ModSrc;
2284 }
2285 
2286 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2287                                          MachineRegisterInfo &MRI,
2288                                          MachineIRBuilder &B) const {
2289 
2290   const LLT S1 = LLT::scalar(1);
2291   const LLT S64 = LLT::scalar(64);
2292   Register Dst = MI.getOperand(0).getReg();
2293   Register OrigSrc = MI.getOperand(1).getReg();
2294   unsigned Flags = MI.getFlags();
2295   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2296          "this should not have been custom lowered");
2297 
2298   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2299   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2300   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2301   // V_FRACT bug is:
2302   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2303   //
2304   // Convert floor(x) to (x - fract(x))
2305 
2306   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2307     .addUse(OrigSrc)
2308     .setMIFlags(Flags);
2309 
2310   // Give source modifier matching some assistance before obscuring a foldable
2311   // pattern.
2312 
2313   // TODO: We can avoid the neg on the fract? The input sign to fract
2314   // shouldn't matter?
2315   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2316 
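  // 0x3fefffffffffffff is the largest double strictly less than 1.0, i.e. the
  // 0.99999999999999999 clamp value from the workaround formula above.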
2317   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2318 
2319   Register Min = MRI.createGenericVirtualRegister(S64);
2320 
2321   // We don't need to concern ourselves with the snan handling difference, so
2322   // use the one which will directly select.
2323   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2324   if (MFI->getMode().IEEE)
2325     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2326   else
2327     B.buildFMinNum(Min, Fract, Const, Flags);
2328 
2329   Register CorrectedFract = Min;
2330   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2332     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2333   }
2334 
2335   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2336   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2337 
2338   MI.eraseFromParent();
2339   return true;
2340 }
2341 
2342 // Turn an illegal packed v2s16 build vector into bit operations.
2343 // TODO: This should probably be a bitcast action in LegalizerHelper.
2344 bool AMDGPULegalizerInfo::legalizeBuildVector(
2345   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2346   Register Dst = MI.getOperand(0).getReg();
2347   const LLT S32 = LLT::scalar(32);
2348   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2349 
2350   Register Src0 = MI.getOperand(1).getReg();
2351   Register Src1 = MI.getOperand(2).getReg();
2352   assert(MRI.getType(Src0) == LLT::scalar(16));
2353 
2354   auto Merge = B.buildMerge(S32, {Src0, Src1});
2355   B.buildBitcast(Dst, Merge);
2356 
2357   MI.eraseFromParent();
2358   return true;
2359 }
2360 
2361 // Return the use branch instruction, otherwise null if the usage is invalid.
2362 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2363                                        MachineRegisterInfo &MRI,
2364                                        MachineInstr *&Br,
2365                                        MachineBasicBlock *&UncondBrTarget) {
2366   Register CondDef = MI.getOperand(0).getReg();
2367   if (!MRI.hasOneNonDBGUse(CondDef))
2368     return nullptr;
2369 
2370   MachineBasicBlock *Parent = MI.getParent();
2371   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2372   if (UseMI.getParent() != Parent ||
2373       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2374     return nullptr;
2375 
2376   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2377   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2378   if (Next == Parent->end()) {
2379     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2380     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2381       return nullptr;
2382     UncondBrTarget = &*NextMBB;
2383   } else {
2384     if (Next->getOpcode() != AMDGPU::G_BR)
2385       return nullptr;
2386     Br = &*Next;
2387     UncondBrTarget = Br->getOperand(0).getMBB();
2388   }
2389 
2390   return &UseMI;
2391 }
2392 
2393 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2394                                                MachineRegisterInfo &MRI,
2395                                                Register LiveIn,
2396                                                Register PhyReg) const {
2397   assert(PhyReg.isPhysical() && "Physical register expected");
2398 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2401   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2402   if (!MRI.getVRegDef(LiveIn)) {
2403     // FIXME: Should have scoped insert pt
2404     MachineBasicBlock &OrigInsBB = B.getMBB();
2405     auto OrigInsPt = B.getInsertPt();
2406 
2407     MachineBasicBlock &EntryMBB = B.getMF().front();
2408     EntryMBB.addLiveIn(PhyReg);
2409     B.setInsertPt(EntryMBB, EntryMBB.begin());
2410     B.buildCopy(LiveIn, PhyReg);
2411 
2412     B.setInsertPt(OrigInsBB, OrigInsPt);
2413   }
2414 
2415   return LiveIn;
2416 }
2417 
2418 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2419                                                 MachineRegisterInfo &MRI,
2420                                                 Register PhyReg, LLT Ty,
2421                                                 bool InsertLiveInCopy) const {
2422   assert(PhyReg.isPhysical() && "Physical register expected");
2423 
  // Get or create the virtual live-in register.
2425   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2426   if (!LiveIn) {
2427     LiveIn = MRI.createGenericVirtualRegister(Ty);
2428     MRI.addLiveIn(PhyReg, LiveIn);
2429   }
2430 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), the live-in copy from the physical
  // register to the virtual register is not needed here.
2434   if (!InsertLiveInCopy)
2435     return LiveIn;
2436 
2437   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2438 }
2439 
2440 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2441     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2442   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2443   const ArgDescriptor *Arg;
2444   const TargetRegisterClass *RC;
2445   LLT ArgTy;
2446   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2447   if (!Arg) {
2448     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2449     return nullptr;
2450   }
2451   return Arg;
2452 }
2453 
2454 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2455                                          const ArgDescriptor *Arg) const {
2456   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2457     return false; // TODO: Handle these
2458 
2459   Register SrcReg = Arg->getRegister();
2460   assert(SrcReg.isPhysical() && "Physical register expected");
2461   assert(DstReg.isVirtual() && "Virtual register expected");
2462 
2463   MachineRegisterInfo &MRI = *B.getMRI();
2464 
2465   LLT Ty = MRI.getType(DstReg);
2466   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2467 
2468   if (Arg->isMasked()) {
2469     // TODO: Should we try to emit this once in the entry block?
2470     const LLT S32 = LLT::scalar(32);
2471     const unsigned Mask = Arg->getMask();
2472     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
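    // The argument lives in a bitfield of the physical register: shift the
    // field down to bit 0 and mask off its width.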
2473 
2474     Register AndMaskSrc = LiveIn;
2475 
2476     if (Shift != 0) {
2477       auto ShiftAmt = B.buildConstant(S32, Shift);
2478       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2479     }
2480 
2481     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2482   } else {
2483     B.buildCopy(DstReg, LiveIn);
2484   }
2485 
2486   return true;
2487 }
2488 
2489 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2490     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2491     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2492 
2493   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2494   if (!Arg)
2495     return false;
2496 
2497   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2498     return false;
2499 
2500   MI.eraseFromParent();
2501   return true;
2502 }
2503 
2504 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2505                                        MachineRegisterInfo &MRI,
2506                                        MachineIRBuilder &B) const {
2507   Register Dst = MI.getOperand(0).getReg();
2508   LLT DstTy = MRI.getType(Dst);
2509   LLT S16 = LLT::scalar(16);
2510   LLT S32 = LLT::scalar(32);
2511   LLT S64 = LLT::scalar(64);
2512 
2513   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2514     return true;
2515 
2516   if (DstTy == S16)
2517     return legalizeFDIV16(MI, MRI, B);
2518   if (DstTy == S32)
2519     return legalizeFDIV32(MI, MRI, B);
2520   if (DstTy == S64)
2521     return legalizeFDIV64(MI, MRI, B);
2522 
2523   return false;
2524 }
2525 
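// Build an integer approximation of 2^32 / Src around the hardware f32
// reciprocal: convert to float, take the reciprocal, scale by 2^32
// (0x4f800000), and convert back to an unsigned integer.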
2526 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2527   const LLT S32 = LLT::scalar(32);
2528 
2529   auto Cvt0 = B.buildUITOFP(S32, Src);
2530   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2531   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2532   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2533   return B.buildFPTOUI(S32, Mul).getReg(0);
2534 }
2535 
2536 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2537                                                   Register DstReg,
2538                                                   Register Num,
2539                                                   Register Den,
2540                                                   bool IsDiv) const {
2541   const LLT S1 = LLT::scalar(1);
2542   const LLT S32 = LLT::scalar(32);
2543 
2544   // RCP =  URECIP(Den) = 2^32 / Den + e
2545   // e is rounding error.
2546   auto RCP = buildDivRCP(B, Den);
2547 
2548   // RCP_LO = mul(RCP, Den)
2549   auto RCP_LO = B.buildMul(S32, RCP, Den);
2550 
  // RCP_HI = mulhu(RCP, Den)
2552   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2553 
2554   // NEG_RCP_LO = -RCP_LO
2555   auto Zero = B.buildConstant(S32, 0);
2556   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2557 
2558   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2559   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2560   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2561 
2562   // Calculate the rounding error from the URECIP instruction
2563   // E = mulhu(ABS_RCP_LO, RCP)
2564   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2565 
2566   // RCP_A_E = RCP + E
2567   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2568 
2569   // RCP_S_E = RCP - E
2570   auto RCP_S_E = B.buildSub(S32, RCP, E);
2571 
2572   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2573   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2574 
  // Quotient = mulhu(Tmp0, Num)
2576   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2577 
2578   // Num_S_Remainder = Quotient * Den
2579   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2580 
2581   // Remainder = Num - Num_S_Remainder
2582   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2583 
2584   // Remainder_GE_Den = Remainder >= Den
2585   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2586 
2587   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2588   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2589                                        Num, Num_S_Remainder);
2590 
2591   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2592   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2593 
2594   // Calculate Division result:
2595 
2596   // Quotient_A_One = Quotient + 1
2597   auto One = B.buildConstant(S32, 1);
2598   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2599 
2600   // Quotient_S_One = Quotient - 1
2601   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2602 
2603   // Div = (Tmp1 ? Quotient_A_One : Quotient)
2604   auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);
2605 
2606   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2607   if (IsDiv) {
2608     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2609   } else {
2610     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2611 
2612     // Calculate Rem result:
2613     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2614 
2615     // Remainder_A_Den = Remainder + Den
2616     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2617 
2618     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2619     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2620 
2621     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2622     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2623   }
2624 }
2625 
2626 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2627                                               MachineRegisterInfo &MRI,
2628                                               MachineIRBuilder &B) const {
2629   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2630   Register DstReg = MI.getOperand(0).getReg();
2631   Register Num = MI.getOperand(1).getReg();
2632   Register Den = MI.getOperand(2).getReg();
2633   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2634   MI.eraseFromParent();
2635   return true;
2636 }
2637 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2639 //
2640 // Return lo, hi of result
2641 //
2642 // %cvt.lo = G_UITOFP Val.lo
2643 // %cvt.hi = G_UITOFP Val.hi
2644 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2645 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2646 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2647 // %mul2 = G_FMUL %mul1, 2**(-32)
2648 // %trunc = G_INTRINSIC_TRUNC %mul2
2649 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2650 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2651 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2652                                                        Register Val) {
2653   const LLT S32 = LLT::scalar(32);
2654   auto Unmerge = B.buildUnmerge(S32, Val);
2655 
2656   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2657   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2658 
2659   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2660                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2661 
2662   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2663   auto Mul1 =
2664       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2665 
2666   // 2**(-32)
2667   auto Mul2 =
2668       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2669   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2670 
2671   // -(2**32)
2672   auto Mad2 = B.buildFMAD(S32, Trunc,
2673                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2674 
2675   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2676   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2677 
2678   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2679 }
2680 
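// 64-bit unsigned division/remainder expansion: start from the initial
// estimate of 2^64 / Denom produced by emitReciprocalU64, refine it with two
// multiply-high correction steps, form the quotient estimate with a 64-bit
// multiply-high, and then fix up the quotient and remainder with up to two
// conditional adjustments by Denom.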
2681 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2682                                                   Register DstReg,
2683                                                   Register Numer,
2684                                                   Register Denom,
2685                                                   bool IsDiv) const {
2686   const LLT S32 = LLT::scalar(32);
2687   const LLT S64 = LLT::scalar(64);
2688   const LLT S1 = LLT::scalar(1);
2689   Register RcpLo, RcpHi;
2690 
2691   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2692 
2693   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2694 
2695   auto Zero64 = B.buildConstant(S64, 0);
2696   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2697 
2698   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2699   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2700 
2701   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2702   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2703   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2704 
2705   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2706   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2707   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2708   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2709 
2710   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2711   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2712   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2713   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2714   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2715 
2716   auto Zero32 = B.buildConstant(S32, 0);
2717   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2718   auto Add2_HiC =
2719       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2720   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2721   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2722 
2723   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2724   Register NumerLo = UnmergeNumer.getReg(0);
2725   Register NumerHi = UnmergeNumer.getReg(1);
2726 
2727   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2728   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2729   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2730   Register Mul3_Lo = UnmergeMul3.getReg(0);
2731   Register Mul3_Hi = UnmergeMul3.getReg(1);
2732   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2733   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2734   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2735   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2736 
2737   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2738   Register DenomLo = UnmergeDenom.getReg(0);
2739   Register DenomHi = UnmergeDenom.getReg(1);
2740 
2741   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2742   auto C1 = B.buildSExt(S32, CmpHi);
2743 
2744   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2745   auto C2 = B.buildSExt(S32, CmpLo);
2746 
2747   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2748   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2749 
2750   // TODO: Here and below portions of the code can be enclosed into if/endif.
2751   // Currently control flow is unconditional and we have 4 selects after
2752   // potential endif to substitute PHIs.
2753 
2754   // if C3 != 0 ...
2755   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2756   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2757   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2758   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2759 
2760   auto One64 = B.buildConstant(S64, 1);
2761   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2762 
2763   auto C4 =
2764       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2765   auto C5 =
2766       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2767   auto C6 = B.buildSelect(
2768       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2769 
2770   // if (C6 != 0)
2771   auto Add4 = B.buildAdd(S64, Add3, One64);
2772   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2773 
2774   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2775   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2776   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2777 
2778   // endif C6
2779   // endif C3
2780 
2781   if (IsDiv) {
2782     auto Sel1 = B.buildSelect(
2783         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2784     B.buildSelect(DstReg,
2785                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2786   } else {
2787     auto Sel2 = B.buildSelect(
2788         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2789     B.buildSelect(DstReg,
2790                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2791   }
2792 }
2793 
2794 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2795                                             MachineRegisterInfo &MRI,
2796                                             MachineIRBuilder &B) const {
2797   const LLT S64 = LLT::scalar(64);
2798   const LLT S32 = LLT::scalar(32);
2799   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2800   Register DstReg = MI.getOperand(0).getReg();
2801   Register Num = MI.getOperand(1).getReg();
2802   Register Den = MI.getOperand(2).getReg();
2803   LLT Ty = MRI.getType(DstReg);
2804 
2805   if (Ty == S32)
2806     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2807   else if (Ty == S64)
2808     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2809   else
2810     return false;
2811 
2812   MI.eraseFromParent();
  return true;
}
2816 
2817 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2818                                             MachineRegisterInfo &MRI,
2819                                             MachineIRBuilder &B) const {
2820   const LLT S64 = LLT::scalar(64);
2821   const LLT S32 = LLT::scalar(32);
2822 
2823   Register DstReg = MI.getOperand(0).getReg();
2824   const LLT Ty = MRI.getType(DstReg);
2825   if (Ty != S32 && Ty != S64)
2826     return false;
2827 
2828   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2829 
2830   Register LHS = MI.getOperand(1).getReg();
2831   Register RHS = MI.getOperand(2).getReg();
2832 
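  // Take absolute values via |x| = (x + sign(x)) ^ sign(x), where sign(x) is
  // the all-ones or all-zeros mask from an arithmetic shift, run the unsigned
  // expansion, and then restore the sign: the quotient is negated when the
  // operand signs differ, and the remainder takes the sign of the LHS.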
2833   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2834   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2835   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2836 
2837   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2838   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2839 
2840   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2841   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2842 
2843   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2844   if (Ty == S32)
2845     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2846   else
2847     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2848 
2849   Register Sign;
2850   if (IsDiv)
2851     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2852   else
2853     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2854 
2855   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2856   B.buildSub(DstReg, UDivRem, Sign);
2857 
2858   MI.eraseFromParent();
2859   return true;
2860 }
2861 
2862 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2863                                                  MachineRegisterInfo &MRI,
2864                                                  MachineIRBuilder &B) const {
2865   Register Res = MI.getOperand(0).getReg();
2866   Register LHS = MI.getOperand(1).getReg();
2867   Register RHS = MI.getOperand(2).getReg();
2868 
2869   uint16_t Flags = MI.getFlags();
2870 
2871   LLT ResTy = MRI.getType(Res);
2872   LLT S32 = LLT::scalar(32);
2873   LLT S64 = LLT::scalar(64);
2874 
2875   const MachineFunction &MF = B.getMF();
2876   bool Unsafe =
2877     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2878 
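  // The expansions below are based on rcp, which is not IEEE accurate and
  // flushes denormals, so bail out when the required precision cannot be
  // relaxed.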
2879   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2880     return false;
2881 
2882   if (!Unsafe && ResTy == S32 &&
2883       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2884     return false;
2885 
2886   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2887     // 1 / x -> RCP(x)
2888     if (CLHS->isExactlyValue(1.0)) {
2889       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2890         .addUse(RHS)
2891         .setMIFlags(Flags);
2892 
2893       MI.eraseFromParent();
2894       return true;
2895     }
2896 
2897     // -1 / x -> RCP( FNEG(x) )
2898     if (CLHS->isExactlyValue(-1.0)) {
2899       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2900       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2901         .addUse(FNeg.getReg(0))
2902         .setMIFlags(Flags);
2903 
2904       MI.eraseFromParent();
2905       return true;
2906     }
2907   }
2908 
2909   // x / y -> x * (1.0 / y)
2910   if (Unsafe) {
2911     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2912       .addUse(RHS)
2913       .setMIFlags(Flags);
2914     B.buildFMul(Res, LHS, RCP, Flags);
2915 
2916     MI.eraseFromParent();
2917     return true;
2918   }
2919 
2920   return false;
2921 }
2922 
2923 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2924                                          MachineRegisterInfo &MRI,
2925                                          MachineIRBuilder &B) const {
2926   Register Res = MI.getOperand(0).getReg();
2927   Register LHS = MI.getOperand(1).getReg();
2928   Register RHS = MI.getOperand(2).getReg();
2929 
2930   uint16_t Flags = MI.getFlags();
2931 
2932   LLT S16 = LLT::scalar(16);
2933   LLT S32 = LLT::scalar(32);
2934 
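  // Compute the quotient in f32: extend both operands, multiply the extended
  // numerator by the reciprocal of the extended denominator, and truncate the
  // result back to f16. div_fixup then corrects the quotient for the special
  // cases this fast path gets wrong.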
2935   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2936   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2937 
2938   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2939     .addUse(RHSExt.getReg(0))
2940     .setMIFlags(Flags);
2941 
2942   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2943   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2944 
2945   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2946     .addUse(RDst.getReg(0))
2947     .addUse(RHS)
2948     .addUse(LHS)
2949     .setMIFlags(Flags);
2950 
2951   MI.eraseFromParent();
2952   return true;
2953 }
2954 
// Enable or disable FP32 denormal mode according to 'Enable' by emitting the
// instructions that update the corresponding field of the mode register.
2957 static void toggleSPDenormMode(bool Enable,
2958                                MachineIRBuilder &B,
2959                                const GCNSubtarget &ST,
2960                                AMDGPU::SIModeRegisterDefaults Mode) {
2961   // Set SP denorm mode to this value.
2962   unsigned SPDenormMode =
2963     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2964 
2965   if (ST.hasDenormModeInst()) {
2966     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2967     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2968 
2969     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2970     B.buildInstr(AMDGPU::S_DENORM_MODE)
2971       .addImm(NewDenormModeValue);
2972 
2973   } else {
2974     // Select FP32 bit field in mode register.
2975     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2976                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2977                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
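    // This is the encoding of hwreg(HW_REG_MODE, 4, 2): a 2-bit field starting
    // at bit 4 of the MODE register, which holds the FP32 denormal controls.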
2978 
2979     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2980       .addImm(SPDenormMode)
2981       .addImm(SPDenormModeBitField);
2982   }
2983 }
2984 
2985 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2986                                          MachineRegisterInfo &MRI,
2987                                          MachineIRBuilder &B) const {
2988   Register Res = MI.getOperand(0).getReg();
2989   Register LHS = MI.getOperand(1).getReg();
2990   Register RHS = MI.getOperand(2).getReg();
2991   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2992   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2993 
2994   uint16_t Flags = MI.getFlags();
2995 
2996   LLT S32 = LLT::scalar(32);
2997   LLT S1 = LLT::scalar(1);
2998 
2999   auto One = B.buildFConstant(S32, 1.0f);
3000 
3001   auto DenominatorScaled =
3002     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3003       .addUse(LHS)
3004       .addUse(RHS)
3005       .addImm(0)
3006       .setMIFlags(Flags);
3007   auto NumeratorScaled =
3008     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3009       .addUse(LHS)
3010       .addUse(RHS)
3011       .addImm(1)
3012       .setMIFlags(Flags);
3013 
3014   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3015     .addUse(DenominatorScaled.getReg(0))
3016     .setMIFlags(Flags);
3017   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3018 
3019   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3020   // aren't modeled as reading it.
3021   if (!Mode.allFP32Denormals())
3022     toggleSPDenormMode(true, B, ST, Mode);
3023 
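  // Refine the approximate reciprocal with Newton-Raphson and form the scaled
  // quotient (n and d are the scaled numerator and denominator):
  //   Fma0 = 1 - d * r         (error of the initial estimate r)
  //   Fma1 = r + r * Fma0      (refined reciprocal)
  //   Mul  = n * Fma1          (initial quotient)
  //   Fma2 = n - d * Mul       (residual)
  //   Fma3 = Mul + Fma2 * Fma1 (refined quotient)
  //   Fma4 = n - d * Fma3      (final residual, consumed by div_fmas)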
3024   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3025   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3026   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3027   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3028   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3029   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3030 
3031   if (!Mode.allFP32Denormals())
3032     toggleSPDenormMode(false, B, ST, Mode);
3033 
3034   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3035     .addUse(Fma4.getReg(0))
3036     .addUse(Fma1.getReg(0))
3037     .addUse(Fma3.getReg(0))
3038     .addUse(NumeratorScaled.getReg(1))
3039     .setMIFlags(Flags);
3040 
3041   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3042     .addUse(Fmas.getReg(0))
3043     .addUse(RHS)
3044     .addUse(LHS)
3045     .setMIFlags(Flags);
3046 
3047   MI.eraseFromParent();
3048   return true;
3049 }
3050 
3051 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3052                                          MachineRegisterInfo &MRI,
3053                                          MachineIRBuilder &B) const {
3054   Register Res = MI.getOperand(0).getReg();
3055   Register LHS = MI.getOperand(1).getReg();
3056   Register RHS = MI.getOperand(2).getReg();
3057 
3058   uint16_t Flags = MI.getFlags();
3059 
3060   LLT S64 = LLT::scalar(64);
3061   LLT S1 = LLT::scalar(1);
3062 
3063   auto One = B.buildFConstant(S64, 1.0);
3064 
3065   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3066     .addUse(LHS)
3067     .addUse(RHS)
3068     .addImm(0)
3069     .setMIFlags(Flags);
3070 
3071   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3072 
3073   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3074     .addUse(DivScale0.getReg(0))
3075     .setMIFlags(Flags);
3076 
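  // Two Newton-Raphson refinement steps on the reciprocal of the scaled
  // denominator: Fma0 and Fma2 are the error terms (1 - d * r), Fma1 and Fma3
  // are the successively refined reciprocals. Mul is the scaled quotient and
  // Fma4 is its residual, which feeds div_fmas.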
3077   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3078   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3079   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3080 
3081   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3082     .addUse(LHS)
3083     .addUse(RHS)
3084     .addImm(1)
3085     .setMIFlags(Flags);
3086 
3087   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3088   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3089   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3090 
3091   Register Scale;
3092   if (!ST.hasUsableDivScaleConditionOutput()) {
3093     // Workaround a hardware bug on SI where the condition output from div_scale
3094     // is not usable.
3095 
3096     LLT S32 = LLT::scalar(32);
3097 
3098     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3099     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3100     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3101     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3102 
3103     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3104                               Scale1Unmerge.getReg(1));
3105     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3106                               Scale0Unmerge.getReg(1));
3107     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3108   } else {
3109     Scale = DivScale1.getReg(1);
3110   }
3111 
3112   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3113     .addUse(Fma4.getReg(0))
3114     .addUse(Fma3.getReg(0))
3115     .addUse(Mul.getReg(0))
3116     .addUse(Scale)
3117     .setMIFlags(Flags);
3118 
3119   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3120     .addUse(Fmas.getReg(0))
3121     .addUse(RHS)
3122     .addUse(LHS)
3123     .setMIFlags(Flags);
3124 
3125   MI.eraseFromParent();
3126   return true;
3127 }
3128 
3129 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3130                                                  MachineRegisterInfo &MRI,
3131                                                  MachineIRBuilder &B) const {
3132   Register Res = MI.getOperand(0).getReg();
3133   Register LHS = MI.getOperand(2).getReg();
3134   Register RHS = MI.getOperand(3).getReg();
3135   uint16_t Flags = MI.getFlags();
3136 
3137   LLT S32 = LLT::scalar(32);
3138   LLT S1 = LLT::scalar(1);
3139 
3140   auto Abs = B.buildFAbs(S32, RHS, Flags);
3141   const APFloat C0Val(1.0f);
3142 
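  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as IEEE single-precision bit
  // patterns. If |RHS| exceeds 2^96, pre-scale the denominator by 2^-32 so
  // the reciprocal stays in range, then multiply the quotient by the same
  // factor to compensate.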
3143   auto C0 = B.buildConstant(S32, 0x6f800000);
3144   auto C1 = B.buildConstant(S32, 0x2f800000);
3145   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3146 
3147   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3148   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3149 
3150   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3151 
3152   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3153     .addUse(Mul0.getReg(0))
3154     .setMIFlags(Flags);
3155 
3156   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3157 
3158   B.buildFMul(Res, Sel, Mul1, Flags);
3159 
3160   MI.eraseFromParent();
3161   return true;
3162 }
3163 
3164 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3165                                                  MachineRegisterInfo &MRI,
3166                                                  MachineIRBuilder &B) const {
3167   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3168   if (!MFI->isEntryFunction()) {
3169     return legalizePreloadedArgIntrin(MI, MRI, B,
3170                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3171   }
3172 
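  // In an entry function the implicit arguments are laid out in the kernarg
  // segment directly after the explicit kernel arguments, so the implicit
  // argument pointer is the kernarg segment pointer plus that fixed offset.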
3173   uint64_t Offset =
3174     ST.getTargetLowering()->getImplicitParameterOffset(
3175       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3176   Register DstReg = MI.getOperand(0).getReg();
3177   LLT DstTy = MRI.getType(DstReg);
3178   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3179 
3180   const ArgDescriptor *Arg;
3181   const TargetRegisterClass *RC;
3182   LLT ArgTy;
3183   std::tie(Arg, RC, ArgTy) =
3184       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3185   if (!Arg)
3186     return false;
3187 
3188   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3189   if (!loadInputValue(KernargPtrReg, B, Arg))
3190     return false;
3191 
3192   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3193   MI.eraseFromParent();
3194   return true;
3195 }
3196 
3197 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3198                                               MachineRegisterInfo &MRI,
3199                                               MachineIRBuilder &B,
3200                                               unsigned AddrSpace) const {
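  // A flat pointer is in the queried segment exactly when the high 32 bits of
  // the 64-bit address equal that segment's aperture base.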
3201   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3202   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3203   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3204   MI.eraseFromParent();
3205   return true;
3206 }
3207 
3208 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3209 // offset (the offset that is included in bounds checking and swizzling, to be
3210 // split between the instruction's voffset and immoffset fields) and soffset
3211 // (the offset that is excluded from bounds checking and swizzling, to go in
3212 // the instruction's soffset field).  This function takes the first kind of
3213 // offset and figures out how to split it between voffset and immoffset.
3214 std::tuple<Register, unsigned, unsigned>
3215 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3216                                         Register OrigOffset) const {
3217   const unsigned MaxImm = 4095;
3218   Register BaseReg;
3219   unsigned TotalConstOffset;
3220   MachineInstr *OffsetDef;
3221   const LLT S32 = LLT::scalar(32);
3222 
3223   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3224     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3225 
3226   unsigned ImmOffset = TotalConstOffset;
3227 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (the value modulo 4096) in the immoffset field, so that the
  // value that is copied/added for the voffset field is a multiple of 4096
  // and stands a better chance of being CSEd with the copy/add for another
  // similar load/store.
  // However, do not do the rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
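  // For example, a total constant offset of 5000 is split into an ImmOffset
  // of 904 and an Overflow of 4096 that is added to the voffset.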
3235   unsigned Overflow = ImmOffset & ~MaxImm;
3236   ImmOffset -= Overflow;
3237   if ((int32_t)Overflow < 0) {
3238     Overflow += ImmOffset;
3239     ImmOffset = 0;
3240   }
3241 
3242   if (Overflow != 0) {
3243     if (!BaseReg) {
3244       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3245     } else {
3246       auto OverflowVal = B.buildConstant(S32, Overflow);
3247       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3248     }
3249   }
3250 
3251   if (!BaseReg)
3252     BaseReg = B.buildConstant(S32, 0).getReg(0);
3253 
3254   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3255 }
3256 
3257 /// Handle register layout difference for f16 images for some subtargets.
3258 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3259                                              MachineRegisterInfo &MRI,
3260                                              Register Reg) const {
3261   if (!ST.hasUnpackedD16VMem())
3262     return Reg;
3263 
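  // In the unpacked layout each 16-bit element occupies the low half of its
  // own 32-bit register, so any-extend every element to s32 and rebuild the
  // value as a vector of s32.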
3264   const LLT S16 = LLT::scalar(16);
3265   const LLT S32 = LLT::scalar(32);
3266   LLT StoreVT = MRI.getType(Reg);
3267   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3268 
3269   auto Unmerge = B.buildUnmerge(S16, Reg);
3270 
3271   SmallVector<Register, 4> WideRegs;
3272   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3273     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3274 
3275   int NumElts = StoreVT.getNumElements();
3276 
3277   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3278 }
3279 
3280 Register AMDGPULegalizerInfo::fixStoreSourceType(
3281   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3282   MachineRegisterInfo *MRI = B.getMRI();
3283   LLT Ty = MRI->getType(VData);
3284 
3285   const LLT S16 = LLT::scalar(16);
3286 
  // Fixup illegal register types for 8-bit and 16-bit stores.
3288   if (Ty == LLT::scalar(8) || Ty == S16) {
3289     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3290     return AnyExt;
3291   }
3292 
3293   if (Ty.isVector()) {
3294     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3295       if (IsFormat)
3296         return handleD16VData(B, *MRI, VData);
3297     }
3298   }
3299 
3300   return VData;
3301 }
3302 
3303 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3304                                               MachineRegisterInfo &MRI,
3305                                               MachineIRBuilder &B,
3306                                               bool IsTyped,
3307                                               bool IsFormat) const {
3308   Register VData = MI.getOperand(1).getReg();
3309   LLT Ty = MRI.getType(VData);
3310   LLT EltTy = Ty.getScalarType();
3311   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3312   const LLT S32 = LLT::scalar(32);
3313 
3314   VData = fixStoreSourceType(B, VData, IsFormat);
3315   Register RSrc = MI.getOperand(2).getReg();
3316 
3317   MachineMemOperand *MMO = *MI.memoperands_begin();
3318   const int MemSize = MMO->getSize();
3319 
3320   unsigned ImmOffset;
3321   unsigned TotalOffset;
3322 
3323   // The typed intrinsics add an immediate after the registers.
3324   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3325 
3326   // The struct intrinsic variants add one additional operand over raw.
3327   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3328   Register VIndex;
3329   int OpOffset = 0;
3330   if (HasVIndex) {
3331     VIndex = MI.getOperand(3).getReg();
3332     OpOffset = 1;
3333   }
3334 
3335   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3336   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3337 
3338   unsigned Format = 0;
3339   if (IsTyped) {
3340     Format = MI.getOperand(5 + OpOffset).getImm();
3341     ++OpOffset;
3342   }
3343 
3344   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3345 
3346   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3347   if (TotalOffset != 0)
3348     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3349 
3350   unsigned Opc;
3351   if (IsTyped) {
3352     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3353                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3354   } else if (IsFormat) {
3355     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3356                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3357   } else {
3358     switch (MemSize) {
3359     case 1:
3360       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3361       break;
3362     case 2:
3363       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3364       break;
3365     default:
3366       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3367       break;
3368     }
3369   }
3370 
3371   if (!VIndex)
3372     VIndex = B.buildConstant(S32, 0).getReg(0);
3373 
3374   auto MIB = B.buildInstr(Opc)
3375     .addUse(VData)              // vdata
3376     .addUse(RSrc)               // rsrc
3377     .addUse(VIndex)             // vindex
3378     .addUse(VOffset)            // voffset
3379     .addUse(SOffset)            // soffset
3380     .addImm(ImmOffset);         // offset(imm)
3381 
3382   if (IsTyped)
3383     MIB.addImm(Format);
3384 
3385   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3386      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3387      .addMemOperand(MMO);
3388 
3389   MI.eraseFromParent();
3390   return true;
3391 }
3392 
3393 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3394                                              MachineRegisterInfo &MRI,
3395                                              MachineIRBuilder &B,
3396                                              bool IsFormat,
3397                                              bool IsTyped) const {
3398   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3399   MachineMemOperand *MMO = *MI.memoperands_begin();
3400   const int MemSize = MMO->getSize();
3401   const LLT S32 = LLT::scalar(32);
3402 
3403   Register Dst = MI.getOperand(0).getReg();
3404   Register RSrc = MI.getOperand(2).getReg();
3405 
3406   // The typed intrinsics add an immediate after the registers.
3407   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3408 
3409   // The struct intrinsic variants add one additional operand over raw.
3410   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3411   Register VIndex;
3412   int OpOffset = 0;
3413   if (HasVIndex) {
3414     VIndex = MI.getOperand(3).getReg();
3415     OpOffset = 1;
3416   }
3417 
3418   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3419   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3420 
3421   unsigned Format = 0;
3422   if (IsTyped) {
3423     Format = MI.getOperand(5 + OpOffset).getImm();
3424     ++OpOffset;
3425   }
3426 
3427   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3428   unsigned ImmOffset;
3429   unsigned TotalOffset;
3430 
3431   LLT Ty = MRI.getType(Dst);
3432   LLT EltTy = Ty.getScalarType();
3433   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3434   const bool Unpacked = ST.hasUnpackedD16VMem();
3435 
3436   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3437   if (TotalOffset != 0)
3438     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3439 
3440   unsigned Opc;
3441 
3442   if (IsTyped) {
3443     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3444                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3445   } else if (IsFormat) {
3446     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3447                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3448   } else {
3449     switch (MemSize) {
3450     case 1:
3451       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3452       break;
3453     case 2:
3454       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3455       break;
3456     default:
3457       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3458       break;
3459     }
3460   }
3461 
3462   Register LoadDstReg;
3463 
3464   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3465   LLT UnpackedTy = Ty.changeElementSize(32);
3466 
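  // Sub-dword loads and d16 scalar loads are widened to a 32-bit result and
  // truncated afterwards; d16 vector loads on unpacked subtargets produce one
  // 32-bit register per element and are repacked below.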
3467   if (IsExtLoad)
3468     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3469   else if (Unpacked && IsD16 && Ty.isVector())
3470     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3471   else
3472     LoadDstReg = Dst;
3473 
3474   if (!VIndex)
3475     VIndex = B.buildConstant(S32, 0).getReg(0);
3476 
3477   auto MIB = B.buildInstr(Opc)
3478     .addDef(LoadDstReg)         // vdata
3479     .addUse(RSrc)               // rsrc
3480     .addUse(VIndex)             // vindex
3481     .addUse(VOffset)            // voffset
3482     .addUse(SOffset)            // soffset
3483     .addImm(ImmOffset);         // offset(imm)
3484 
3485   if (IsTyped)
3486     MIB.addImm(Format);
3487 
3488   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3489      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3490      .addMemOperand(MMO);
3491 
3492   if (LoadDstReg != Dst) {
3493     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3494 
    // The result register was widened for an extending load; truncate it back
    // down to the original result type.
3496     if (IsExtLoad)
3497       B.buildTrunc(Dst, LoadDstReg);
3498     else {
3499       // Repack to original 16-bit vector result
3500       // FIXME: G_TRUNC should work, but legalization currently fails
3501       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3502       SmallVector<Register, 4> Repack;
3503       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3504         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3505       B.buildMerge(Dst, Repack);
3506     }
3507   }
3508 
3509   MI.eraseFromParent();
3510   return true;
3511 }
3512 
3513 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3514                                                MachineIRBuilder &B,
3515                                                bool IsInc) const {
3516   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3517                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3518   B.buildInstr(Opc)
3519     .addDef(MI.getOperand(0).getReg())
3520     .addUse(MI.getOperand(2).getReg())
3521     .addUse(MI.getOperand(3).getReg())
3522     .cloneMemRefs(MI);
3523   MI.eraseFromParent();
3524   return true;
3525 }
3526 
3527 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3528   switch (IntrID) {
3529   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3530   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3531     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3532   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3533   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3534     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3535   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3536   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3537     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3538   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3539   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3540     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3541   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3542   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3543     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3544   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3545   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3546     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3547   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3548   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3549     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3550   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3551   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3552     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3553   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3554   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3555     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3556   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3557   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3558     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3559   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3560   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3561     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3562   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3563   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3564     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3565   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3566   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3567     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3568   default:
3569     llvm_unreachable("unhandled atomic opcode");
3570   }
3571 }
3572 
3573 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3574                                                MachineIRBuilder &B,
3575                                                Intrinsic::ID IID) const {
3576   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3577                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3578 
3579   Register Dst = MI.getOperand(0).getReg();
3580   Register VData = MI.getOperand(2).getReg();
3581 
3582   Register CmpVal;
3583   int OpOffset = 0;
3584 
3585   if (IsCmpSwap) {
3586     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3587     ++OpOffset;
3588   }
3589 
3590   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3591   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3592 
3593   // The struct intrinsic variants add one additional operand over raw.
3594   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3595   Register VIndex;
3596   if (HasVIndex) {
3597     VIndex = MI.getOperand(4 + OpOffset).getReg();
3598     ++OpOffset;
3599   }
3600 
3601   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3602   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3603   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3604 
3605   MachineMemOperand *MMO = *MI.memoperands_begin();
3606 
3607   unsigned ImmOffset;
3608   unsigned TotalOffset;
3609   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3610   if (TotalOffset != 0)
3611     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3612 
3613   if (!VIndex)
3614     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3615 
3616   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3617     .addDef(Dst)
3618     .addUse(VData); // vdata
3619 
3620   if (IsCmpSwap)
3621     MIB.addReg(CmpVal);
3622 
3623   MIB.addUse(RSrc)               // rsrc
3624      .addUse(VIndex)             // vindex
3625      .addUse(VOffset)            // voffset
3626      .addUse(SOffset)            // soffset
3627      .addImm(ImmOffset)          // offset(imm)
3628      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3629      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3630      .addMemOperand(MMO);
3631 
3632   MI.eraseFromParent();
3633   return true;
3634 }
3635 
/// Turn a set of s16 typed address operands of \p MI into dword sized
/// <2 x s16> registers, appending them to \p PackedAddrs.
3638 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3639                                         SmallVectorImpl<Register> &PackedAddrs,
3640                                         int AddrIdx, int DimIdx, int EndIdx,
3641                                         int NumGradients) {
3642   const LLT S16 = LLT::scalar(16);
3643   const LLT V2S16 = LLT::vector(2, 16);
3644 
3645   for (int I = AddrIdx; I < EndIdx; ++I) {
3646     MachineOperand &SrcOp = MI.getOperand(I);
3647     if (!SrcOp.isReg())
3648       continue; // _L to _LZ may have eliminated this.
3649 
3650     Register AddrReg = SrcOp.getReg();
3651 
3652     if (I < DimIdx) {
3653       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3654       PackedAddrs.push_back(AddrReg);
3655     } else {
3656       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3657       // derivatives dx/dh and dx/dv are packed with undef.
3658       if (((I + 1) >= EndIdx) ||
3659           ((NumGradients / 2) % 2 == 1 &&
3660            (I == DimIdx + (NumGradients / 2) - 1 ||
3661             I == DimIdx + NumGradients - 1)) ||
3662           // Check for _L to _LZ optimization
3663           !MI.getOperand(I + 1).isReg()) {
3664         PackedAddrs.push_back(
3665             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3666                 .getReg(0));
3667       } else {
3668         PackedAddrs.push_back(
3669             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3670                 .getReg(0));
3671         ++I;
3672       }
3673     }
3674   }
3675 }
3676 
3677 /// Convert from separate vaddr components to a single vector address register,
3678 /// and replace the remaining operands with $noreg.
3679 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3680                                      int DimIdx, int NumVAddrs) {
3681   const LLT S32 = LLT::scalar(32);
3682 
3683   SmallVector<Register, 8> AddrRegs;
3684   for (int I = 0; I != NumVAddrs; ++I) {
3685     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3686     if (SrcOp.isReg()) {
3687       AddrRegs.push_back(SrcOp.getReg());
3688       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3689     }
3690   }
3691 
3692   int NumAddrRegs = AddrRegs.size();
3693   if (NumAddrRegs != 1) {
3694     // Round up to 8 elements for v5-v7
3695     // FIXME: Missing intermediate sized register classes and instructions.
3696     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3697       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3698       auto Undef = B.buildUndef(S32);
3699       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3700       NumAddrRegs = RoundedNumRegs;
3701     }
3702 
3703     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3704     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3705   }
3706 
3707   for (int I = 1; I != NumVAddrs; ++I) {
3708     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3709     if (SrcOp.isReg())
3710       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3711   }
3712 }
3713 
3714 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3715 ///
3716 /// Depending on the subtarget, load/store with 16-bit element data need to be
3717 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3718 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3719 /// registers.
3720 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all of the register repacking to the legalizer/combiners,
/// and we don't want a selected instruction entering RegBankSelect. To avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
3727 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3728     MachineInstr &MI, MachineIRBuilder &B,
3729     GISelChangeObserver &Observer,
3730     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3731 
3732   const int NumDefs = MI.getNumExplicitDefs();
3733   bool IsTFE = NumDefs == 2;
3734   // We are only processing the operands of d16 image operations on subtargets
3735   // that use the unpacked register layout, or need to repack the TFE result.
3736 
3737   // TODO: Do we need to guard against already legalized intrinsics?
3738   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3739     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3740 
3741   MachineRegisterInfo *MRI = B.getMRI();
3742   const LLT S32 = LLT::scalar(32);
3743   const LLT S16 = LLT::scalar(16);
3744   const LLT V2S16 = LLT::vector(2, 16);
3745 
3746   // Index of first address argument
3747   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3748 
3749   int NumVAddrs, NumGradients;
3750   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3751   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3752     getDMaskIdx(BaseOpcode, NumDefs);
3753   unsigned DMask = 0;
3754 
  // Check whether the gradient and address operands use 16-bit types.
3756   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3757   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3758   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3759   const bool IsG16 = GradTy == S16;
3760   const bool IsA16 = AddrTy == S16;
3761 
3762   int DMaskLanes = 0;
3763   if (!BaseOpcode->Atomic) {
3764     DMask = MI.getOperand(DMaskIdx).getImm();
3765     if (BaseOpcode->Gather4) {
3766       DMaskLanes = 4;
3767     } else if (DMask != 0) {
3768       DMaskLanes = countPopulation(DMask);
3769     } else if (!IsTFE && !BaseOpcode->Store) {
3770       // If dmask is 0, this is a no-op load. This can be eliminated.
3771       B.buildUndef(MI.getOperand(0));
3772       MI.eraseFromParent();
3773       return true;
3774     }
3775   }
3776 
3777   Observer.changingInstr(MI);
3778   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3779 
3780   unsigned NewOpcode = NumDefs == 0 ?
3781     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3782 
3783   // Track that we legalized this
3784   MI.setDesc(B.getTII().get(NewOpcode));
3785 
  // If TFE is enabled and dmask is 0, we still expect an error flag result.
  // Force dmask to be at least 1, otherwise the instruction will fail.
3788   if (IsTFE && DMask == 0) {
3789     DMask = 0x1;
3790     DMaskLanes = 1;
3791     MI.getOperand(DMaskIdx).setImm(DMask);
3792   }
3793 
3794   if (BaseOpcode->Atomic) {
3795     Register VData0 = MI.getOperand(2).getReg();
3796     LLT Ty = MRI->getType(VData0);
3797 
3798     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3799     if (Ty.isVector())
3800       return false;
3801 
3802     if (BaseOpcode->AtomicX2) {
3803       Register VData1 = MI.getOperand(3).getReg();
3804       // The two values are packed in one register.
3805       LLT PackedTy = LLT::vector(2, Ty);
3806       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3807       MI.getOperand(2).setReg(Concat.getReg(0));
3808       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3809     }
3810   }
3811 
3812   int CorrectedNumVAddrs = NumVAddrs;
3813 
3814   // Optimize _L to _LZ when _L is zero
3815   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3816         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3817     const ConstantFP *ConstantLod;
3818     const int LodIdx = AddrIdx + NumVAddrs - 1;
3819 
3820     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3821       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3822         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3823         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3824           LZMappingInfo->LZ, ImageDimIntr->Dim);
3825 
3826         // The starting indexes should remain in the same place.
3827         --NumVAddrs;
3828         --CorrectedNumVAddrs;
3829 
3830         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3831           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3832         MI.RemoveOperand(LodIdx);
3833       }
3834     }
3835   }
3836 
  // Optimize _mip away when 'lod' is zero
3838   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3839     int64_t ConstantLod;
3840     const int LodIdx = AddrIdx + NumVAddrs - 1;
3841 
3842     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3843       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3846         MI.getOperand(LodIdx).ChangeToImmediate(0);
3847         --CorrectedNumVAddrs;
3848       }
3849     }
3850   }
3851 
3852   // Rewrite the addressing register layout before doing anything else.
3853   if (IsA16 || IsG16) {
3854     if (IsA16) {
3855       // Target must support the feature and gradients need to be 16 bit too
3856       if (!ST.hasA16() || !IsG16)
3857         return false;
3858     } else if (!ST.hasG16())
3859       return false;
3860 
3861     if (NumVAddrs > 1) {
3862       SmallVector<Register, 4> PackedRegs;
3863       // Don't compress addresses for G16
3864       const int PackEndIdx =
3865           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3866       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3867                                   PackEndIdx, NumGradients);
3868 
3869       if (!IsA16) {
3870         // Add uncompressed address
3871         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
          Register AddrReg = MI.getOperand(I).getReg();
3873           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3874           PackedRegs.push_back(AddrReg);
3875         }
3876       }
3877 
3878       // See also below in the non-a16 branch
3879       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3880 
3881       if (!UseNSA && PackedRegs.size() > 1) {
3882         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3883         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3884         PackedRegs[0] = Concat.getReg(0);
3885         PackedRegs.resize(1);
3886       }
3887 
3888       const int NumPacked = PackedRegs.size();
3889       for (int I = 0; I != NumVAddrs; ++I) {
3890         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3891         if (!SrcOp.isReg()) {
3892           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3893           continue;
3894         }
3895 
3896         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3897 
3898         if (I < NumPacked)
3899           SrcOp.setReg(PackedRegs[I]);
3900         else
3901           SrcOp.setReg(AMDGPU::NoRegister);
3902       }
3903     }
3904   } else {
3905     // If the register allocator cannot place the address registers contiguously
3906     // without introducing moves, then using the non-sequential address encoding
3907     // is always preferable, since it saves VALU instructions and is usually a
3908     // wash in terms of code size or even better.
3909     //
3910     // However, we currently have no way of hinting to the register allocator
3911     // that MIMG addresses should be placed contiguously when it is possible to
3912     // do so, so force non-NSA for the common 2-address case as a heuristic.
3913     //
3914     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3915     // allocation when possible.
3916     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3917 
3918     if (!UseNSA && NumVAddrs > 1)
3919       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3920   }
3921 
3922   int Flags = 0;
3923   if (IsA16)
3924     Flags |= 1;
3925   if (IsG16)
3926     Flags |= 2;
3927   MI.addOperand(MachineOperand::CreateImm(Flags));
3928 
3929   if (BaseOpcode->Store) { // No TFE for stores?
3930     // TODO: Handle dmask trim
3931     Register VData = MI.getOperand(1).getReg();
3932     LLT Ty = MRI->getType(VData);
3933     if (!Ty.isVector() || Ty.getElementType() != S16)
3934       return true;
3935 
3936     Register RepackedReg = handleD16VData(B, *MRI, VData);
3937     if (RepackedReg != VData) {
3938       MI.getOperand(1).setReg(RepackedReg);
3939     }
3940 
3941     return true;
3942   }
3943 
3944   Register DstReg = MI.getOperand(0).getReg();
3945   LLT Ty = MRI->getType(DstReg);
3946   const LLT EltTy = Ty.getScalarType();
3947   const bool IsD16 = Ty.getScalarType() == S16;
3948   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3949 
3950   // Confirm that the return type is large enough for the dmask specified
3951   if (NumElts < DMaskLanes)
3952     return false;
3953 
3954   if (NumElts > 4 || DMaskLanes > 4)
3955     return false;
3956 
3957   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3958   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3959 
  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3963   LLT RoundedTy;
3964 
  // S32 vector to cover all data, plus the TFE result element.
3966   LLT TFETy;
3967 
3968   // Register type to use for each loaded component. Will be S32 or V2S16.
3969   LLT RegTy;
3970 
3971   if (IsD16 && ST.hasUnpackedD16VMem()) {
3972     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3973     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3974     RegTy = S32;
3975   } else {
3976     unsigned EltSize = EltTy.getSizeInBits();
3977     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3978     unsigned RoundedSize = 32 * RoundedElts;
3979     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3980     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3981     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3982   }
3983 
3984   // The return type does not need adjustment.
3985   // TODO: Should we change s16 case to s32 or <2 x s16>?
3986   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3987     return true;
3988 
3989   Register Dst1Reg;
3990 
3991   // Insert after the instruction.
3992   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3993 
3994   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3995   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3996   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3997   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3998 
3999   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4000 
4001   MI.getOperand(0).setReg(NewResultReg);
4002 
4003   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
4005   // register, with one additional dword beyond the loaded data. Rewrite the
4006   // return type to use a single register result.
4007 
4008   if (IsTFE) {
4009     Dst1Reg = MI.getOperand(1).getReg();
4010     if (MRI->getType(Dst1Reg) != S32)
4011       return false;
4012 
4013     // TODO: Make sure the TFE operand bit is set.
4014     MI.RemoveOperand(1);
4015 
4016     // Handle the easy case that requires no repack instructions.
4017     if (Ty == S32) {
4018       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4019       return true;
4020     }
4021   }
4022 
4023   // Now figure out how to copy the new result register back into the old
4024   // result.
4025   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4026 
4027   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4028 
4029   if (ResultNumRegs == 1) {
4030     assert(!IsTFE);
4031     ResultRegs[0] = NewResultReg;
4032   } else {
4033     // We have to repack into a new vector of some kind.
4034     for (int I = 0; I != NumDataRegs; ++I)
4035       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4036     B.buildUnmerge(ResultRegs, NewResultReg);
4037 
4038     // Drop the final TFE element to get the data part. The TFE result is
4039     // directly written to the right place already.
4040     if (IsTFE)
4041       ResultRegs.resize(NumDataRegs);
4042   }
4043 
  // For an s16 scalar result, the load produced an s32 value regardless of
  // packed vs. unpacked; truncate it back down to s16.
4046   if (IsD16 && !Ty.isVector()) {
4047     B.buildTrunc(DstReg, ResultRegs[0]);
4048     return true;
4049   }
4050 
4051   // Avoid a build/concat_vector of 1 entry.
4052   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4053     B.buildBitcast(DstReg, ResultRegs[0]);
4054     return true;
4055   }
4056 
4057   assert(Ty.isVector());
4058 
4059   if (IsD16) {
4060     // For packed D16 results with TFE enabled, all the data components are
4061     // S32. Cast back to the expected type.
4062     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4065     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4066       for (Register &Reg : ResultRegs)
4067         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4068     } else if (ST.hasUnpackedD16VMem()) {
4069       for (Register &Reg : ResultRegs)
4070         Reg = B.buildTrunc(S16, Reg).getReg(0);
4071     }
4072   }
4073 
4074   auto padWithUndef = [&](LLT Ty, int NumElts) {
4075     if (NumElts == 0)
4076       return;
4077     Register Undef = B.buildUndef(Ty).getReg(0);
4078     for (int I = 0; I != NumElts; ++I)
4079       ResultRegs.push_back(Undef);
4080   };
4081 
4082   // Pad out any elements eliminated due to the dmask.
4083   LLT ResTy = MRI->getType(ResultRegs[0]);
4084   if (!ResTy.isVector()) {
4085     padWithUndef(ResTy, NumElts - ResultRegs.size());
4086     B.buildBuildVector(DstReg, ResultRegs);
4087     return true;
4088   }
4089 
4090   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4091   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4092 
4093   // Deal with the one annoying legal case.
4094   const LLT V3S16 = LLT::vector(3, 16);
4095   if (Ty == V3S16) {
4096     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4097     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4098     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4099     return true;
4100   }
4101 
4102   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4103   B.buildConcatVectors(DstReg, ResultRegs);
4104   return true;
4105 }
4106 
4107 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4108   MachineInstr &MI, MachineIRBuilder &B,
4109   GISelChangeObserver &Observer) const {
4110   Register Dst = MI.getOperand(0).getReg();
4111   LLT Ty = B.getMRI()->getType(Dst);
4112   unsigned Size = Ty.getSizeInBits();
4113   MachineFunction &MF = B.getMF();
4114 
4115   Observer.changingInstr(MI);
4116 
4117   // FIXME: We don't really need this intermediate instruction. The intrinsic
4118   // should be fixed to have a memory operand. Since it's readnone, we're not
4119   // allowed to add one.
4120   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4121   MI.RemoveOperand(1); // Remove intrinsic ID
4122 
4123   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4124   // TODO: Should this use datalayout alignment?
4125   const unsigned MemSize = (Size + 7) / 8;
4126   const Align MemAlign(4);
4127   MachineMemOperand *MMO = MF.getMachineMemOperand(
4128       MachinePointerInfo(),
4129       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4130           MachineMemOperand::MOInvariant,
4131       MemSize, MemAlign);
4132   MI.addMemOperand(MF, MMO);
4133 
4134   // There are no 96-bit result scalar loads, but widening to 128-bit should
4135   // always be legal. We may need to restore this to a 96-bit result if it turns
4136   // out this needs to be converted to a vector load during RegBankSelect.
4137   if (!isPowerOf2_32(Size)) {
4138     LegalizerHelper Helper(MF, *this, Observer, B);
4139 
4140     if (Ty.isVector())
4141       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4142     else
4143       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4144   }
4145 
4146   Observer.changedInstr(MI);
4147   return true;
4148 }
4149 
4150 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4151                                                 MachineRegisterInfo &MRI,
4152                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA target or the trap handler is disabled, just insert
  // an s_endpgm instruction.
4154   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4155       !ST.isTrapHandlerEnabled()) {
4156     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4157   } else {
4158     // Pass queue pointer to trap handler as input, and insert trap instruction
4159     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4160     const ArgDescriptor *Arg =
4161         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4162     if (!Arg)
4163       return false;
4164     MachineRegisterInfo &MRI = *B.getMRI();
4165     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4166     Register LiveIn = getLiveInRegister(
4167         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4168         /*InsertLiveInCopy=*/false);
4169     if (!loadInputValue(LiveIn, B, Arg))
4170       return false;
4171     B.buildCopy(SGPR01, LiveIn);
4172     B.buildInstr(AMDGPU::S_TRAP)
4173         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4174         .addReg(SGPR01, RegState::Implicit);
4175   }
4176 
4177   MI.eraseFromParent();
4178   return true;
4179 }
4180 
4181 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4182     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA target or the trap handler is disabled, report a
  // warning accordingly.
4185   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4186       !ST.isTrapHandlerEnabled()) {
4187     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4188                                      "debugtrap handler not supported",
4189                                      MI.getDebugLoc(), DS_Warning);
4190     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4191     Ctx.diagnose(NoTrap);
4192   } else {
4193     // Insert debug-trap instruction
4194     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4195   }
4196 
4197   MI.eraseFromParent();
4198   return true;
4199 }
4200 
4201 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4202                                             MachineInstr &MI) const {
4203   MachineIRBuilder &B = Helper.MIRBuilder;
4204   MachineRegisterInfo &MRI = *B.getMRI();
4205 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
4207   auto IntrID = MI.getIntrinsicID();
4208   switch (IntrID) {
4209   case Intrinsic::amdgcn_if:
4210   case Intrinsic::amdgcn_else: {
4211     MachineInstr *Br = nullptr;
4212     MachineBasicBlock *UncondBrTarget = nullptr;
4213     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4214       const SIRegisterInfo *TRI
4215         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4216 
4217       Register Def = MI.getOperand(1).getReg();
4218       Register Use = MI.getOperand(3).getReg();
4219 
4220       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4221       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4222       if (IntrID == Intrinsic::amdgcn_if) {
4223         B.buildInstr(AMDGPU::SI_IF)
4224           .addDef(Def)
4225           .addUse(Use)
4226           .addMBB(UncondBrTarget);
4227       } else {
4228         B.buildInstr(AMDGPU::SI_ELSE)
4229           .addDef(Def)
4230           .addUse(Use)
4231           .addMBB(UncondBrTarget)
4232           .addImm(0);
4233       }
4234 
4235       if (Br) {
4236         Br->getOperand(0).setMBB(CondBrTarget);
4237       } else {
4238         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4239         // since we're swapping branch targets it needs to be reinserted.
4240         // FIXME: IRTranslator should probably not do this
4241         B.buildBr(*CondBrTarget);
4242       }
4243 
4244       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4245       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4246       MI.eraseFromParent();
4247       BrCond->eraseFromParent();
4248       return true;
4249     }
4250 
4251     return false;
4252   }
4253   case Intrinsic::amdgcn_loop: {
4254     MachineInstr *Br = nullptr;
4255     MachineBasicBlock *UncondBrTarget = nullptr;
4256     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4257       const SIRegisterInfo *TRI
4258         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4259 
4260       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4261       Register Reg = MI.getOperand(2).getReg();
4262 
4263       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4264       B.buildInstr(AMDGPU::SI_LOOP)
4265         .addUse(Reg)
4266         .addMBB(UncondBrTarget);
4267 
4268       if (Br)
4269         Br->getOperand(0).setMBB(CondBrTarget);
4270       else
4271         B.buildBr(*CondBrTarget);
4272 
4273       MI.eraseFromParent();
4274       BrCond->eraseFromParent();
4275       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4276       return true;
4277     }
4278 
4279     return false;
4280   }
4281   case Intrinsic::amdgcn_kernarg_segment_ptr:
4282     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4283       // This only makes sense to call in a kernel, so just lower to null.
4284       B.buildConstant(MI.getOperand(0).getReg(), 0);
4285       MI.eraseFromParent();
4286       return true;
4287     }
4288 
4289     return legalizePreloadedArgIntrin(
4290       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4291   case Intrinsic::amdgcn_implicitarg_ptr:
4292     return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
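  // The wavefront size is a fixed subtarget property, so fold it directly to a
  // constant.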
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
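  // The buffer store and load intrinsics share a common lowering; the two
  // boolean arguments distinguish the plain, *_format, and tbuffer flavors.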
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
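  // All buffer atomics share a single lowering; the intrinsic ID is passed
  // along so legalizeBufferAtomic can select the specific operation.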
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
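    // Any remaining intrinsic with image dimension info gets the generic image
    // lowering; everything else needs no further legalization here.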
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}