1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use "
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of vector elements up to the next power of two.
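// e.g. <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.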
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the scalar size in bits up to the next power of two.
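// e.g. s24 becomes s32 and s48 becomes s64.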
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
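// Match vectors with an odd number of sub-32-bit elements whose total size is
// not a multiple of 32 bits, e.g. <3 x s16> (48 bits) or <5 x s8> (40 bits).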
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
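// Match vectors of 16-bit elements with more than two elements, e.g. <4 x s16>.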
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
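// Reduce the element count so each resulting piece is at most 64 bits wide;
// only applied below to vectors wider than 64 bits, e.g. <4 x s32> (128 bits)
// becomes <2 x s32>.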
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
100 // Increase the number of vector elements so the total size reaches the next
101 // multiple of 32 bits.
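// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits), and <5 x s16>
// (80 bits) to <6 x s16> (96 bits).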
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
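// Coerce a type to the s32-based register form, e.g. s64 to <2 x s32>,
// <6 x s16> (96 bits) to <3 x s32>, and <2 x s8> to s16.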
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size < 32) {
125       // <2 x s8> -> s16
126       assert(Size == 16);
127       CoercedTy = LLT::scalar(16);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
182 // Any combination of 32-bit or 64-bit elements, up to the maximum register
183 // size, and multiples of v2s16.
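// e.g. s96, <3 x s32> and <6 x s16> qualify; <3 x s16> (48 bits) does not,
// since its size is not a multiple of 32.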
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
299 // work around this. Eventually it should ignore the type for loads and only
300 // care about the size. Return true in cases where we will work around this for
301 // now by bitcasting.
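// e.g. an s96 or <8 x s16> access is instead bitcast to <3 x s32> or <4 x s32>
// by the load/store rules below.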
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
444   // FIXME: Not really legal. Placeholder for custom lowering.
445   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446     .customFor({S32, S64})
447     .clampScalar(0, S32, S64)
448     .widenScalarToNextPow2(0, 32)
449     .scalarize(0);
450 
451   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
452     .legalFor({S32})
453     .clampScalar(0, S32, S32)
454     .scalarize(0);
455 
456   // Report legal for any types we can handle anywhere. For the cases only legal
457   // on the SALU, RegBankSelect will be able to re-legalize.
458   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
459     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
460     .clampScalar(0, S32, S64)
461     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
462     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
463     .widenScalarToNextPow2(0)
464     .scalarize(0);
465 
466   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
467                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468     .legalFor({{S32, S1}, {S32, S32}})
469     .minScalar(0, S32)
470     // TODO: .scalarize(0)
471     .lower();
472 
473   getActionDefinitionsBuilder(G_BITCAST)
474     // Don't worry about the size constraint.
475     .legalIf(all(isRegisterType(0), isRegisterType(1)))
476     .lower();
477 
478 
479   getActionDefinitionsBuilder(G_CONSTANT)
480     .legalFor({S1, S32, S64, S16, GlobalPtr,
481                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
482     .clampScalar(0, S32, S64)
483     .widenScalarToNextPow2(0)
484     .legalIf(isPointer(0));
485 
486   getActionDefinitionsBuilder(G_FCONSTANT)
487     .legalFor({S32, S64, S16})
488     .clampScalar(0, S16, S64);
489 
490   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491       .legalIf(isRegisterType(0))
492       // s1 and s16 are special cases because they have legal operations on
493       // them, but don't really occupy registers in the normal way.
494       .legalFor({S1, S16})
495       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496       .clampScalarOrElt(0, S32, MaxScalar)
497       .widenScalarToNextPow2(0, 32)
498       .clampMaxNumElements(0, S32, 16);
499 
500   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501 
502   // If the amount is divergent, we have to do a wave reduction to get the
503   // maximum value, so this is expanded during RegBankSelect.
504   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505     .legalFor({{PrivatePtr, S32}});
506 
507   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508     .unsupportedFor({PrivatePtr})
509     .custom();
510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511 
512   auto &FPOpActions = getActionDefinitionsBuilder(
513     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
514     .legalFor({S32, S64});
515   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
516     .customFor({S32, S64});
517   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
518     .customFor({S32, S64});
519 
520   if (ST.has16BitInsts()) {
521     if (ST.hasVOP3PInsts())
522       FPOpActions.legalFor({S16, V2S16});
523     else
524       FPOpActions.legalFor({S16});
525 
526     TrigActions.customFor({S16});
527     FDIVActions.customFor({S16});
528   }
529 
530   auto &MinNumMaxNum = getActionDefinitionsBuilder({
531       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
532 
533   if (ST.hasVOP3PInsts()) {
534     MinNumMaxNum.customFor(FPTypesPK16)
535       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
536       .clampMaxNumElements(0, S16, 2)
537       .clampScalar(0, S16, S64)
538       .scalarize(0);
539   } else if (ST.has16BitInsts()) {
540     MinNumMaxNum.customFor(FPTypes16)
541       .clampScalar(0, S16, S64)
542       .scalarize(0);
543   } else {
544     MinNumMaxNum.customFor(FPTypesBase)
545       .clampScalar(0, S32, S64)
546       .scalarize(0);
547   }
548 
549   if (ST.hasVOP3PInsts())
550     FPOpActions.clampMaxNumElements(0, S16, 2);
551 
552   FPOpActions
553     .scalarize(0)
554     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
555 
556   TrigActions
557     .scalarize(0)
558     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
559 
560   FDIVActions
561     .scalarize(0)
562     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
563 
564   getActionDefinitionsBuilder({G_FNEG, G_FABS})
565     .legalFor(FPTypesPK16)
566     .clampMaxNumElements(0, S16, 2)
567     .scalarize(0)
568     .clampScalar(0, S16, S64);
569 
570   if (ST.has16BitInsts()) {
571     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
572       .legalFor({S32, S64, S16})
573       .scalarize(0)
574       .clampScalar(0, S16, S64);
575   } else {
576     getActionDefinitionsBuilder(G_FSQRT)
577       .legalFor({S32, S64})
578       .scalarize(0)
579       .clampScalar(0, S32, S64);
580 
581     if (ST.hasFractBug()) {
582       getActionDefinitionsBuilder(G_FFLOOR)
583         .customFor({S64})
584         .legalFor({S32, S64})
585         .scalarize(0)
586         .clampScalar(0, S32, S64);
587     } else {
588       getActionDefinitionsBuilder(G_FFLOOR)
589         .legalFor({S32, S64})
590         .scalarize(0)
591         .clampScalar(0, S32, S64);
592     }
593   }
594 
595   getActionDefinitionsBuilder(G_FPTRUNC)
596     .legalFor({{S32, S64}, {S16, S32}})
597     .scalarize(0)
598     .lower();
599 
600   getActionDefinitionsBuilder(G_FPEXT)
601     .legalFor({{S64, S32}, {S32, S16}})
602     .lowerFor({{S64, S16}}) // FIXME: Implement
603     .scalarize(0);
604 
605   getActionDefinitionsBuilder(G_FSUB)
606       // Use actual fsub instruction
607       .legalFor({S32})
608       // Must use fadd + fneg
609       .lowerFor({S64, S16, V2S16})
610       .scalarize(0)
611       .clampScalar(0, S32, S64);
612 
613   // Whether this is legal depends on the floating point mode for the function.
614   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
616     FMad.customFor({S32, S16});
617   else if (ST.hasMadMacF32Insts())
618     FMad.customFor({S32});
619   else if (ST.hasMadF16())
620     FMad.customFor({S16});
621   FMad.scalarize(0)
622       .lower();
623 
624   // TODO: Do we need to clamp maximum bitwidth?
625   getActionDefinitionsBuilder(G_TRUNC)
626     .legalIf(isScalar(0))
627     .legalFor({{V2S16, V2S32}})
628     .clampMaxNumElements(0, S16, 2)
629     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
630     // situations (like an invalid implicit use), we don't want to loop forever
631     // in the legalizer.
632     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
633     .alwaysLegal();
634 
635   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
636     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
637                {S32, S1}, {S64, S1}, {S16, S1}})
638     .scalarize(0)
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(1, 32);
641 
642   // TODO: Split s1->s64 during regbankselect for VALU.
643   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
644     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
645     .lowerFor({{S32, S64}})
646     .lowerIf(typeIs(1, S1))
647     .customFor({{S64, S64}});
648   if (ST.has16BitInsts())
649     IToFP.legalFor({{S16, S16}});
650   IToFP.clampScalar(1, S32, S64)
651        .scalarize(0)
652        .widenScalarToNextPow2(1);
653 
654   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656     .customFor({{S64, S64}});
657   if (ST.has16BitInsts())
658     FPToI.legalFor({{S16, S16}});
659   else
660     FPToI.minScalar(1, S32);
661 
662   FPToI.minScalar(0, S32)
663        .scalarize(0)
664        .lower();
665 
666   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
667     .scalarize(0)
668     .lower();
669 
670   if (ST.has16BitInsts()) {
671     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
672       .legalFor({S16, S32, S64})
673       .clampScalar(0, S16, S64)
674       .scalarize(0);
675   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
676     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
677       .legalFor({S32, S64})
678       .clampScalar(0, S32, S64)
679       .scalarize(0);
680   } else {
681     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
682       .legalFor({S32})
683       .customFor({S64})
684       .clampScalar(0, S32, S64)
685       .scalarize(0);
686   }
687 
688   // FIXME: Clamp offset operand.
689   getActionDefinitionsBuilder(G_PTR_ADD)
690     .legalIf(isPointer(0))
691     .scalarize(0);
692 
693   getActionDefinitionsBuilder(G_PTRMASK)
694     .legalIf(typeInSet(1, {S64, S32}))
695     .minScalar(1, S32)
696     .maxScalarIf(sizeIs(0, 32), 1, S32)
697     .maxScalarIf(sizeIs(0, 64), 1, S64)
698     .scalarize(0);
699 
700   auto &CmpBuilder =
701     getActionDefinitionsBuilder(G_ICMP)
702     // The compare output type differs based on the register bank of the output,
703     // so make both s1 and s32 legal.
704     //
705     // Scalar compares producing output in scc will be promoted to s32, as that
706     // is the allocatable register type that will be needed for the copy from
707     // scc. This will be promoted during RegBankSelect, and we assume something
708     // before that won't try to use s32 result types.
709     //
710     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
711     // bank.
712     .legalForCartesianProduct(
713       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
714     .legalForCartesianProduct(
715       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
716   if (ST.has16BitInsts()) {
717     CmpBuilder.legalFor({{S1, S16}});
718   }
719 
720   CmpBuilder
721     .widenScalarToNextPow2(1)
722     .clampScalar(1, S32, S64)
723     .scalarize(0)
724     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
725 
726   getActionDefinitionsBuilder(G_FCMP)
727     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
728     .widenScalarToNextPow2(1)
729     .clampScalar(1, S32, S64)
730     .scalarize(0);
731 
732   // FIXME: fpow has a selection pattern that should move to custom lowering.
733   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
734   if (ST.has16BitInsts())
735     Exp2Ops.legalFor({S32, S16});
736   else
737     Exp2Ops.legalFor({S32});
738   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
739   Exp2Ops.scalarize(0);
740 
741   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
742   if (ST.has16BitInsts())
743     ExpOps.customFor({{S32}, {S16}});
744   else
745     ExpOps.customFor({S32});
746   ExpOps.clampScalar(0, MinScalarFPTy, S32)
747         .scalarize(0);
748 
749   // The 64-bit versions produce 32-bit results, but only on the SALU.
750   getActionDefinitionsBuilder(G_CTPOP)
751     .legalFor({{S32, S32}, {S32, S64}})
752     .clampScalar(0, S32, S32)
753     .clampScalar(1, S32, S64)
754     .scalarize(0)
755     .widenScalarToNextPow2(0, 32)
756     .widenScalarToNextPow2(1, 32);
757 
758   // The hardware instructions return a different result on 0 than the generic
759   // instructions expect. The hardware produces -1, but these produce the
760   // bitwidth.
761   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
762     .scalarize(0)
763     .clampScalar(0, S32, S32)
764     .clampScalar(1, S32, S64)
765     .widenScalarToNextPow2(0, 32)
766     .widenScalarToNextPow2(1, 32)
767     .lower();
768 
769   // The 64-bit versions produce 32-bit results, but only on the SALU.
770   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
771     .legalFor({{S32, S32}, {S32, S64}})
772     .clampScalar(0, S32, S32)
773     .clampScalar(1, S32, S64)
774     .scalarize(0)
775     .widenScalarToNextPow2(0, 32)
776     .widenScalarToNextPow2(1, 32);
777 
778   getActionDefinitionsBuilder(G_BITREVERSE)
779     .legalFor({S32})
780     .clampScalar(0, S32, S32)
781     .scalarize(0);
782 
783   if (ST.has16BitInsts()) {
784     getActionDefinitionsBuilder(G_BSWAP)
785       .legalFor({S16, S32, V2S16})
786       .clampMaxNumElements(0, S16, 2)
787       // FIXME: Fixing non-power-of-2 before clamp is workaround for
788       // narrowScalar limitation.
789       .widenScalarToNextPow2(0)
790       .clampScalar(0, S16, S32)
791       .scalarize(0);
792 
793     if (ST.hasVOP3PInsts()) {
794       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
795         .legalFor({S32, S16, V2S16})
796         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
797         .clampMaxNumElements(0, S16, 2)
798         .minScalar(0, S16)
799         .widenScalarToNextPow2(0)
800         .scalarize(0)
801         .lower();
802     } else {
803       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
804         .legalFor({S32, S16})
805         .widenScalarToNextPow2(0)
806         .minScalar(0, S16)
807         .scalarize(0)
808         .lower();
809     }
810   } else {
811     // TODO: Should have same legality without v_perm_b32
812     getActionDefinitionsBuilder(G_BSWAP)
813       .legalFor({S32})
814       .lowerIf(scalarNarrowerThan(0, 32))
815       // FIXME: Fixing non-power-of-2 before clamp is workaround for
816       // narrowScalar limitation.
817       .widenScalarToNextPow2(0)
818       .maxScalar(0, S32)
819       .scalarize(0)
820       .lower();
821 
822     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
823       .legalFor({S32})
824       .minScalar(0, S32)
825       .widenScalarToNextPow2(0)
826       .scalarize(0)
827       .lower();
828   }
829 
830   getActionDefinitionsBuilder(G_INTTOPTR)
831     // List the common cases
832     .legalForCartesianProduct(AddrSpaces64, {S64})
833     .legalForCartesianProduct(AddrSpaces32, {S32})
834     .scalarize(0)
835     // Accept any address space as long as the size matches
836     .legalIf(sameSize(0, 1))
837     .widenScalarIf(smallerThan(1, 0),
838       [](const LegalityQuery &Query) {
839         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
840       })
841     .narrowScalarIf(largerThan(1, 0),
842       [](const LegalityQuery &Query) {
843         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
844       });
845 
846   getActionDefinitionsBuilder(G_PTRTOINT)
847     // List the common cases
848     .legalForCartesianProduct(AddrSpaces64, {S64})
849     .legalForCartesianProduct(AddrSpaces32, {S32})
850     .scalarize(0)
851     // Accept any address space as long as the size matches
852     .legalIf(sameSize(0, 1))
853     .widenScalarIf(smallerThan(0, 1),
854       [](const LegalityQuery &Query) {
855         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
856       })
857     .narrowScalarIf(
858       largerThan(0, 1),
859       [](const LegalityQuery &Query) {
860         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
861       });
862 
863   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
864     .scalarize(0)
865     .custom();
866 
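  // Decide whether a load/store must be split into multiple memory operations:
  // vector extloads, accesses wider than the address space allows, an
  // unsupported number of 32-bit registers, or insufficient alignment for the
  // target.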
867   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
868                                     bool IsLoad) -> bool {
869     const LLT DstTy = Query.Types[0];
870 
871     // Split vector extloads.
872     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
873     unsigned Align = Query.MMODescrs[0].AlignInBits;
874 
875     if (MemSize < DstTy.getSizeInBits())
876       MemSize = std::max(MemSize, Align);
877 
878     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
879       return true;
880 
881     const LLT PtrTy = Query.Types[1];
882     unsigned AS = PtrTy.getAddressSpace();
883     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
884       return true;
885 
886     // Catch weird-sized loads that don't evenly divide into the access sizes.
887     // TODO: May be able to widen depending on alignment etc.
888     unsigned NumRegs = (MemSize + 31) / 32;
889     if (NumRegs == 3) {
890       if (!ST.hasDwordx3LoadStores())
891         return true;
892     } else {
893       // If the alignment allows, these should have been widened.
894       if (!isPowerOf2_32(NumRegs))
895         return true;
896     }
897 
898     if (Align < MemSize) {
899       const SITargetLowering *TLI = ST.getTargetLowering();
900       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
901     }
902 
903     return false;
904   };
905 
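  // Widen an odd-sized load result when the known alignment already covers the
  // next power-of-2 size and the widened access still fits the address space.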
906   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
907                                          unsigned Opc) -> bool {
908     unsigned Size = Query.Types[0].getSizeInBits();
909     if (isPowerOf2_32(Size))
910       return false;
911 
912     if (Size == 96 && ST.hasDwordx3LoadStores())
913       return false;
914 
915     unsigned AddrSpace = Query.Types[1].getAddressSpace();
916     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
917       return false;
918 
919     unsigned Align = Query.MMODescrs[0].AlignInBits;
920     unsigned RoundedSize = NextPowerOf2(Size);
921     return (Align >= RoundedSize);
922   };
923 
924   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
925   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
926   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
927 
928   // TODO: Refine based on subtargets which support unaligned access or 128-bit
929   // LDS
930   // TODO: Unsupported flat for SI.
931 
932   for (unsigned Op : {G_LOAD, G_STORE}) {
933     const bool IsStore = Op == G_STORE;
934 
935     auto &Actions = getActionDefinitionsBuilder(Op);
936     // Explicitly list some common cases.
937     // TODO: Does this help compile time at all?
938     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
939                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
940                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
941                                       {S64, GlobalPtr, 64, GlobalAlign32},
942                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
943                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
944                                       {S32, GlobalPtr, 8, GlobalAlign8},
945                                       {S32, GlobalPtr, 16, GlobalAlign16},
946 
947                                       {S32, LocalPtr, 32, 32},
948                                       {S64, LocalPtr, 64, 32},
949                                       {V2S32, LocalPtr, 64, 32},
950                                       {S32, LocalPtr, 8, 8},
951                                       {S32, LocalPtr, 16, 16},
952                                       {V2S16, LocalPtr, 32, 32},
953 
954                                       {S32, PrivatePtr, 32, 32},
955                                       {S32, PrivatePtr, 8, 8},
956                                       {S32, PrivatePtr, 16, 16},
957                                       {V2S16, PrivatePtr, 32, 32},
958 
959                                       {S32, ConstantPtr, 32, GlobalAlign32},
960                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
961                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
962                                       {S64, ConstantPtr, 64, GlobalAlign32},
963                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
964     Actions.legalIf(
965       [=](const LegalityQuery &Query) -> bool {
966         return isLoadStoreLegal(ST, Query, Op);
967       });
968 
969     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
970     // 64-bits.
971     //
972     // TODO: Should generalize bitcast action into coerce, which will also cover
973     // inserting addrspacecasts.
974     Actions.customIf(typeIs(1, Constant32Ptr));
975 
976     // Turn any illegal element vectors into something easier to deal
977     // with. These will ultimately produce 32-bit scalar shifts to extract the
978     // parts anyway.
979     //
980     // For odd 16-bit element vectors, prefer to split those into pieces with
981     // 16-bit vector parts.
982     Actions.bitcastIf(
983       [=](const LegalityQuery &Query) -> bool {
984         const LLT Ty = Query.Types[0];
985 
986         // Do not cast an extload/truncstore.
987         if (Ty.getSizeInBits() != Query.MMODescrs[0].SizeInBits)
988           return false;
989 
990         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
991           return true;
992         const unsigned Size = Ty.getSizeInBits();
993         return Ty.isVector() && isRegisterSize(Size) &&
994                !isRegisterVectorElementType(Ty.getElementType());
995       }, bitcastToRegisterType(0));
996 
997     Actions
998         .customIf(typeIs(1, Constant32Ptr))
999         // Widen suitably aligned loads by loading extra elements.
1000         .moreElementsIf([=](const LegalityQuery &Query) {
1001             const LLT Ty = Query.Types[0];
1002             return Op == G_LOAD && Ty.isVector() &&
1003                    shouldWidenLoadResult(Query, Op);
1004           }, moreElementsToNextPow2(0))
1005         .widenScalarIf([=](const LegalityQuery &Query) {
1006             const LLT Ty = Query.Types[0];
1007             return Op == G_LOAD && !Ty.isVector() &&
1008                    shouldWidenLoadResult(Query, Op);
1009           }, widenScalarOrEltToNextPow2(0))
1010         .narrowScalarIf(
1011             [=](const LegalityQuery &Query) -> bool {
1012               return !Query.Types[0].isVector() &&
1013                      needToSplitMemOp(Query, Op == G_LOAD);
1014             },
1015             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1016               const LLT DstTy = Query.Types[0];
1017               const LLT PtrTy = Query.Types[1];
1018 
1019               const unsigned DstSize = DstTy.getSizeInBits();
1020               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1021 
1022               // Split extloads.
1023               if (DstSize > MemSize)
1024                 return std::make_pair(0, LLT::scalar(MemSize));
1025 
1026               if (!isPowerOf2_32(DstSize)) {
1027                 // We're probably decomposing an odd sized store. Try to split
1028                 // to the widest type. TODO: Account for alignment. As-is it
1029                 // should be OK, since the new parts will be further legalized.
1030                 unsigned FloorSize = PowerOf2Floor(DstSize);
1031                 return std::make_pair(0, LLT::scalar(FloorSize));
1032               }
1033 
1034               if (DstSize > 32 && (DstSize % 32 != 0)) {
1035                 // FIXME: Need a way to specify non-extload of larger size if
1036                 // suitably aligned.
1037                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1038               }
1039 
1040               unsigned MaxSize = maxSizeForAddrSpace(ST,
1041                                                      PtrTy.getAddressSpace(),
1042                                                      Op == G_LOAD);
1043               if (MemSize > MaxSize)
1044                 return std::make_pair(0, LLT::scalar(MaxSize));
1045 
1046               unsigned Align = Query.MMODescrs[0].AlignInBits;
1047               return std::make_pair(0, LLT::scalar(Align));
1048             })
1049         .fewerElementsIf(
1050             [=](const LegalityQuery &Query) -> bool {
1051               return Query.Types[0].isVector() &&
1052                      needToSplitMemOp(Query, Op == G_LOAD);
1053             },
1054             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1055               const LLT DstTy = Query.Types[0];
1056               const LLT PtrTy = Query.Types[1];
1057 
1058               LLT EltTy = DstTy.getElementType();
1059               unsigned MaxSize = maxSizeForAddrSpace(ST,
1060                                                      PtrTy.getAddressSpace(),
1061                                                      Op == G_LOAD);
1062 
1063               // FIXME: Handle widened to power of 2 results better. This ends
1064               // up scalarizing.
1065               // FIXME: 3 element stores scalarized on SI
1066 
1067               // Split if it's too large for the address space.
1068               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1069                 unsigned NumElts = DstTy.getNumElements();
1070                 unsigned EltSize = EltTy.getSizeInBits();
1071 
1072                 if (MaxSize % EltSize == 0) {
1073                   return std::make_pair(
1074                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1075                 }
1076 
1077                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1078 
1079                 // FIXME: Refine when odd breakdowns handled
1080                 // The scalars will need to be re-legalized.
1081                 if (NumPieces == 1 || NumPieces >= NumElts ||
1082                     NumElts % NumPieces != 0)
1083                   return std::make_pair(0, EltTy);
1084 
1085                 return std::make_pair(0,
1086                                       LLT::vector(NumElts / NumPieces, EltTy));
1087               }
1088 
1089               // FIXME: We could probably handle weird extending loads better.
1090               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1091               if (DstTy.getSizeInBits() > MemSize)
1092                 return std::make_pair(0, EltTy);
1093 
1094               unsigned EltSize = EltTy.getSizeInBits();
1095               unsigned DstSize = DstTy.getSizeInBits();
1096               if (!isPowerOf2_32(DstSize)) {
1097                 // We're probably decomposing an odd sized store. Try to split
1098                 // to the widest type. TODO: Account for alignment. As-is it
1099                 // should be OK, since the new parts will be further legalized.
1100                 unsigned FloorSize = PowerOf2Floor(DstSize);
1101                 return std::make_pair(
1102                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1103               }
1104 
1105               // Need to split because of alignment.
1106               unsigned Align = Query.MMODescrs[0].AlignInBits;
1107               if (EltSize > Align &&
1108                   (EltSize / Align < DstTy.getNumElements())) {
1109                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1110               }
1111 
1112               // May need relegalization for the scalars.
1113               return std::make_pair(0, EltTy);
1114             })
1115         .minScalar(0, S32);
1116 
1117     if (IsStore)
1118       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1119 
1120     // TODO: Need a bitcast lower option?
1121     Actions
1122         .widenScalarToNextPow2(0)
1123         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1124   }
1125 
1126   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1127                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1128                                                   {S32, GlobalPtr, 16, 2 * 8},
1129                                                   {S32, LocalPtr, 8, 8},
1130                                                   {S32, LocalPtr, 16, 16},
1131                                                   {S32, PrivatePtr, 8, 8},
1132                                                   {S32, PrivatePtr, 16, 16},
1133                                                   {S32, ConstantPtr, 8, 8},
1134                                                   {S32, ConstantPtr, 16, 2 * 8}});
1135   if (ST.hasFlatAddressSpace()) {
1136     ExtLoads.legalForTypesWithMemDesc(
1137         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1138   }
1139 
1140   ExtLoads.clampScalar(0, S32, S32)
1141           .widenScalarToNextPow2(0)
1142           .unsupportedIfMemSizeNotPow2()
1143           .lower();
1144 
1145   auto &Atomics = getActionDefinitionsBuilder(
1146     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1147      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1148      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1149      G_ATOMICRMW_UMIN})
1150     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1151                {S64, GlobalPtr}, {S64, LocalPtr}});
1152   if (ST.hasFlatAddressSpace()) {
1153     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1154   }
1155 
1156   if (ST.hasLDSFPAtomics()) {
1157     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1158       .legalFor({{S32, LocalPtr}});
1159   }
1160 
1161   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1162   // demarshalling.
1163   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1164     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1165                 {S32, FlatPtr}, {S64, FlatPtr}})
1166     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1167                {S32, RegionPtr}, {S64, RegionPtr}});
1168   // TODO: Pointer types, any 32-bit or 64-bit vector
1169 
1170   // Condition should be s32 for scalar, s1 for vector.
1171   getActionDefinitionsBuilder(G_SELECT)
1172     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1173           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1174           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1175     .clampScalar(0, S16, S64)
1176     .scalarize(1)
1177     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1178     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1179     .clampMaxNumElements(0, S32, 2)
1180     .clampMaxNumElements(0, LocalPtr, 2)
1181     .clampMaxNumElements(0, PrivatePtr, 2)
1182     .scalarize(0)
1183     .widenScalarToNextPow2(0)
1184     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1185 
1186   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1187   // be more flexible with the shift amount type.
1188   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1189     .legalFor({{S32, S32}, {S64, S32}});
1190   if (ST.has16BitInsts()) {
1191     if (ST.hasVOP3PInsts()) {
1192       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1193             .clampMaxNumElements(0, S16, 2);
1194     } else
1195       Shifts.legalFor({{S16, S16}});
1196 
1197     // TODO: Support 16-bit shift amounts for all types
1198     Shifts.widenScalarIf(
1199       [=](const LegalityQuery &Query) {
1200         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1201         // 32-bit amount.
1202         const LLT ValTy = Query.Types[0];
1203         const LLT AmountTy = Query.Types[1];
1204         return ValTy.getSizeInBits() <= 16 &&
1205                AmountTy.getSizeInBits() < 16;
1206       }, changeTo(1, S16));
1207     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1208     Shifts.clampScalar(1, S32, S32);
1209     Shifts.clampScalar(0, S16, S64);
1210     Shifts.widenScalarToNextPow2(0, 16);
1211   } else {
1212     // Make sure we legalize the shift amount type first, as the general
1213     // expansion for the shifted type will produce much worse code if it hasn't
1214     // been truncated already.
1215     Shifts.clampScalar(1, S32, S32);
1216     Shifts.clampScalar(0, S32, S64);
1217     Shifts.widenScalarToNextPow2(0, 32);
1218   }
1219   Shifts.scalarize(0);
1220 
1221   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1222     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1223     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1224     unsigned IdxTypeIdx = 2;
1225 
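    // Use custom lowering when the element type is 16 bits or a multiple of 32
    // bits, the vector size is a multiple of 32 bits up to the maximum register
    // size, and the index is 32 bits.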
1226     getActionDefinitionsBuilder(Op)
1227       .customIf([=](const LegalityQuery &Query) {
1228           const LLT EltTy = Query.Types[EltTypeIdx];
1229           const LLT VecTy = Query.Types[VecTypeIdx];
1230           const LLT IdxTy = Query.Types[IdxTypeIdx];
1231           return (EltTy.getSizeInBits() == 16 ||
1232                   EltTy.getSizeInBits() % 32 == 0) &&
1233                  VecTy.getSizeInBits() % 32 == 0 &&
1234                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1235                  IdxTy.getSizeInBits() == 32;
1236         })
1237       .clampScalar(EltTypeIdx, S32, S64)
1238       .clampScalar(VecTypeIdx, S32, S64)
1239       .clampScalar(IdxTypeIdx, S32, S32);
1240   }
1241 
1242   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1243     .unsupportedIf([=](const LegalityQuery &Query) {
1244         const LLT &EltTy = Query.Types[1].getElementType();
1245         return Query.Types[0] != EltTy;
1246       });
1247 
1248   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1249     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1250     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1251 
1252     // FIXME: Doesn't handle extract of illegal sizes.
1253     getActionDefinitionsBuilder(Op)
1254       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1255       // FIXME: Multiples of 16 should not be legal.
1256       .legalIf([=](const LegalityQuery &Query) {
1257           const LLT BigTy = Query.Types[BigTyIdx];
1258           const LLT LitTy = Query.Types[LitTyIdx];
1259           return (BigTy.getSizeInBits() % 32 == 0) &&
1260                  (LitTy.getSizeInBits() % 16 == 0);
1261         })
1262       .widenScalarIf(
1263         [=](const LegalityQuery &Query) {
1264           const LLT BigTy = Query.Types[BigTyIdx];
1265           return (BigTy.getScalarSizeInBits() < 16);
1266         },
1267         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1268       .widenScalarIf(
1269         [=](const LegalityQuery &Query) {
1270           const LLT LitTy = Query.Types[LitTyIdx];
1271           return (LitTy.getScalarSizeInBits() < 16);
1272         },
1273         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1274       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1275       .widenScalarToNextPow2(BigTyIdx, 32);
1276 
1277   }
1278 
1279   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1280     .legalForCartesianProduct(AllS32Vectors, {S32})
1281     .legalForCartesianProduct(AllS64Vectors, {S64})
1282     .clampNumElements(0, V16S32, V32S32)
1283     .clampNumElements(0, V2S64, V16S64)
1284     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1285 
1286   if (ST.hasScalarPackInsts()) {
1287     BuildVector
1288       // FIXME: Should probably widen s1 vectors straight to s32
1289       .minScalarOrElt(0, S16)
1290       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1291       .minScalar(1, S32);
1292 
1293     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1294       .legalFor({V2S16, S32})
1295       .lower();
1296     BuildVector.minScalarOrElt(0, S32);
1297   } else {
1298     BuildVector.customFor({V2S16, S16});
1299     BuildVector.minScalarOrElt(0, S32);
1300 
1301     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1302       .customFor({V2S16, S32})
1303       .lower();
1304   }
1305 
1306   BuildVector.legalIf(isRegisterType(0));
1307 
1308   // FIXME: Clamp maximum size
1309   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1310     .legalIf(isRegisterType(0));
1311 
1312   // TODO: Don't fully scalarize v2s16 pieces? Or combine those out before
1313   // legalization.
1314   if (ST.hasVOP3PInsts()) {
1315     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1316       .customFor({V2S16, V2S16})
1317       .lower();
1318   } else
1319     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1320 
1321   // Merge/Unmerge
1322   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1323     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1324     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1325 
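    // Vectors whose element type is smaller than 8 bits, wider than 512 bits,
    // or not a power-of-2 size are broken down into scalars below.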
1326     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1327       const LLT Ty = Query.Types[TypeIdx];
1328       if (Ty.isVector()) {
1329         const LLT &EltTy = Ty.getElementType();
1330         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1331           return true;
1332         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1333           return true;
1334       }
1335       return false;
1336     };
1337 
1338     auto &Builder = getActionDefinitionsBuilder(Op)
1339       .lowerFor({{S16, V2S16}})
1340       .lowerIf([=](const LegalityQuery &Query) {
1341           const LLT BigTy = Query.Types[BigTyIdx];
1342           return BigTy.getSizeInBits() == 32;
1343         })
1344       // Try to widen to s16 first for small types.
1345       // TODO: Only do this on targets with legal s16 shifts
1346       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1347       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1348       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1349       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1350                            elementTypeIs(1, S16)),
1351                        changeTo(1, V2S16))
1352       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1353       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1354       // valid.
1355       .clampScalar(LitTyIdx, S32, S512)
1356       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1357       // Break up vectors with weird elements into scalars
1358       .fewerElementsIf(
1359         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1360         scalarize(0))
1361       .fewerElementsIf(
1362         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1363         scalarize(1))
1364       .clampScalar(BigTyIdx, S32, MaxScalar);
1365 
1366     if (Op == G_MERGE_VALUES) {
1367       Builder.widenScalarIf(
1368         // TODO: Use 16-bit shifts if legal for 8-bit values?
1369         [=](const LegalityQuery &Query) {
1370           const LLT Ty = Query.Types[LitTyIdx];
1371           return Ty.getSizeInBits() < 32;
1372         },
1373         changeTo(LitTyIdx, S32));
1374     }
1375 
1376     Builder.widenScalarIf(
1377       [=](const LegalityQuery &Query) {
1378         const LLT Ty = Query.Types[BigTyIdx];
1379         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1380           Ty.getSizeInBits() % 16 != 0;
1381       },
1382       [=](const LegalityQuery &Query) {
1383         // Pick the next power of 2, or a multiple of 64 for sizes over 128,
1384         // whichever is smaller.
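        // e.g. s65 is widened to s128, and s321 is rounded up to s384.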
1385         const LLT &Ty = Query.Types[BigTyIdx];
1386         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1387         if (NewSizeInBits >= 256) {
1388           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1389           if (RoundedTo < NewSizeInBits)
1390             NewSizeInBits = RoundedTo;
1391         }
1392         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1393       })
1394       .legalIf([=](const LegalityQuery &Query) {
1395           const LLT &BigTy = Query.Types[BigTyIdx];
1396           const LLT &LitTy = Query.Types[LitTyIdx];
1397 
1398           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1399             return false;
1400           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1401             return false;
1402 
1403           return BigTy.getSizeInBits() % 16 == 0 &&
1404                  LitTy.getSizeInBits() % 16 == 0 &&
1405                  BigTy.getSizeInBits() <= MaxRegisterSize;
1406         })
1407       // Any vectors left are the wrong size. Scalarize them.
1408       .scalarize(0)
1409       .scalarize(1);
1410   }
1411 
1412   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1413   // RegBankSelect.
1414   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1415     .legalFor({{S32}, {S64}});
1416 
1417   if (ST.hasVOP3PInsts()) {
1418     SextInReg.lowerFor({{V2S16}})
1419       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1420       // get more vector shift opportunities, since we'll get those when
1421       // expanded.
1422       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1423   } else if (ST.has16BitInsts()) {
1424     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1425   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1428     SextInReg.lowerFor({{S32}, {S64}});
1429   }
1430 
1431   SextInReg
1432     .scalarize(0)
1433     .clampScalar(0, S32, S64)
1434     .lower();
1435 
1436   getActionDefinitionsBuilder(G_FSHR)
1437     .legalFor({{S32, S32}})
1438     .scalarize(0)
1439     .lower();
1440 
1441   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1442     .legalFor({S64});
1443 
1444   getActionDefinitionsBuilder({
1445       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1446       G_FCOPYSIGN,
1447 
1448       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1449       G_READ_REGISTER,
1450       G_WRITE_REGISTER,
1451 
1452       G_SADDO, G_SSUBO,
1453 
1454        // TODO: Implement
1455       G_FMINIMUM, G_FMAXIMUM,
1456       G_FSHL
1457     }).lower();
1458 
1459   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1460         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1461         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1462     .unsupported();
1463 
1464   computeTables();
1465   verify(*ST.getInstrInfo());
1466 }
1467 
1468 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1469                                          MachineInstr &MI) const {
1470   MachineIRBuilder &B = Helper.MIRBuilder;
1471   MachineRegisterInfo &MRI = *B.getMRI();
1472   GISelChangeObserver &Observer = Helper.Observer;
1473 
1474   switch (MI.getOpcode()) {
1475   case TargetOpcode::G_ADDRSPACE_CAST:
1476     return legalizeAddrSpaceCast(MI, MRI, B);
1477   case TargetOpcode::G_FRINT:
1478     return legalizeFrint(MI, MRI, B);
1479   case TargetOpcode::G_FCEIL:
1480     return legalizeFceil(MI, MRI, B);
1481   case TargetOpcode::G_INTRINSIC_TRUNC:
1482     return legalizeIntrinsicTrunc(MI, MRI, B);
1483   case TargetOpcode::G_SITOFP:
1484     return legalizeITOFP(MI, MRI, B, true);
1485   case TargetOpcode::G_UITOFP:
1486     return legalizeITOFP(MI, MRI, B, false);
1487   case TargetOpcode::G_FPTOSI:
1488     return legalizeFPTOI(MI, MRI, B, true);
1489   case TargetOpcode::G_FPTOUI:
1490     return legalizeFPTOI(MI, MRI, B, false);
1491   case TargetOpcode::G_FMINNUM:
1492   case TargetOpcode::G_FMAXNUM:
1493   case TargetOpcode::G_FMINNUM_IEEE:
1494   case TargetOpcode::G_FMAXNUM_IEEE:
1495     return legalizeMinNumMaxNum(Helper, MI);
1496   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1497     return legalizeExtractVectorElt(MI, MRI, B);
1498   case TargetOpcode::G_INSERT_VECTOR_ELT:
1499     return legalizeInsertVectorElt(MI, MRI, B);
1500   case TargetOpcode::G_SHUFFLE_VECTOR:
1501     return legalizeShuffleVector(MI, MRI, B);
1502   case TargetOpcode::G_FSIN:
1503   case TargetOpcode::G_FCOS:
1504     return legalizeSinCos(MI, MRI, B);
1505   case TargetOpcode::G_GLOBAL_VALUE:
1506     return legalizeGlobalValue(MI, MRI, B);
1507   case TargetOpcode::G_LOAD:
1508     return legalizeLoad(MI, MRI, B, Observer);
1509   case TargetOpcode::G_FMAD:
1510     return legalizeFMad(MI, MRI, B);
1511   case TargetOpcode::G_FDIV:
1512     return legalizeFDIV(MI, MRI, B);
1513   case TargetOpcode::G_UDIV:
1514   case TargetOpcode::G_UREM:
1515     return legalizeUDIV_UREM(MI, MRI, B);
1516   case TargetOpcode::G_SDIV:
1517   case TargetOpcode::G_SREM:
1518     return legalizeSDIV_SREM(MI, MRI, B);
1519   case TargetOpcode::G_ATOMIC_CMPXCHG:
1520     return legalizeAtomicCmpXChg(MI, MRI, B);
1521   case TargetOpcode::G_FLOG:
1522     return legalizeFlog(MI, B, numbers::ln2f);
1523   case TargetOpcode::G_FLOG10:
1524     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1525   case TargetOpcode::G_FEXP:
1526     return legalizeFExp(MI, B);
1527   case TargetOpcode::G_FPOW:
1528     return legalizeFPow(MI, B);
1529   case TargetOpcode::G_FFLOOR:
1530     return legalizeFFloor(MI, MRI, B);
1531   case TargetOpcode::G_BUILD_VECTOR:
1532     return legalizeBuildVector(MI, MRI, B);
1533   default:
1534     return false;
1535   }
1536 
1537   llvm_unreachable("expected switch to return");
1538 }
1539 
1540 Register AMDGPULegalizerInfo::getSegmentAperture(
1541   unsigned AS,
1542   MachineRegisterInfo &MRI,
1543   MachineIRBuilder &B) const {
1544   MachineFunction &MF = B.getMF();
1545   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1546   const LLT S32 = LLT::scalar(32);
1547 
1548   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1549 
1550   if (ST.hasApertureRegs()) {
1551     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1552     // getreg.
1553     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1554         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1555         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1556     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1557         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1558         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1559     unsigned Encoding =
1560         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1561         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1562         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1563 
1564     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1565 
1566     B.buildInstr(AMDGPU::S_GETREG_B32)
1567       .addDef(GetReg)
1568       .addImm(Encoding);
1569     MRI.setType(GetReg, S32);
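    // S_GETREG_B32 returns the selected field right-justified; shift it left
    // by the field width (WidthM1 + 1) to reconstruct the 32-bit high half of
    // the aperture address.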
1570 
1571     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1572     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1573   }
1574 
1575   Register QueuePtr = MRI.createGenericVirtualRegister(
1576     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1577 
1578   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1579   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1580     return Register();
1581 
1582   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1583   // private_segment_aperture_base_hi.
1584   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1585 
1586   // TODO: can we be smarter about machine pointer info?
1587   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1588   MachineMemOperand *MMO = MF.getMachineMemOperand(
1589       PtrInfo,
1590       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1591           MachineMemOperand::MOInvariant,
1592       4, commonAlignment(Align(64), StructOffset));
1593 
1594   Register LoadAddr;
1595 
1596   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1597   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1598 }
1599 
1600 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1601   MachineInstr &MI, MachineRegisterInfo &MRI,
1602   MachineIRBuilder &B) const {
1603   MachineFunction &MF = B.getMF();
1604 
1605   const LLT S32 = LLT::scalar(32);
1606   Register Dst = MI.getOperand(0).getReg();
1607   Register Src = MI.getOperand(1).getReg();
1608 
1609   LLT DstTy = MRI.getType(Dst);
1610   LLT SrcTy = MRI.getType(Src);
1611   unsigned DestAS = DstTy.getAddressSpace();
1612   unsigned SrcAS = SrcTy.getAddressSpace();
1613 
1614   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1615   // vector element.
1616   assert(!DstTy.isVector());
1617 
1618   const AMDGPUTargetMachine &TM
1619     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1620 
1621   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1622   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1623     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1624     return true;
1625   }
1626 
1627   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1628     // Truncate.
1629     B.buildExtract(Dst, Src, 0);
1630     MI.eraseFromParent();
1631     return true;
1632   }
1633 
1634   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1635     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1636     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1637 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1641     auto HighAddr = B.buildConstant(
1642       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1643     B.buildMerge(Dst, {Src, HighAddr});
1644     MI.eraseFromParent();
1645     return true;
1646   }
1647 
1648   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1649     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1650            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1651     unsigned NullVal = TM.getNullPointerValue(DestAS);
1652 
1653     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1654     auto FlatNull = B.buildConstant(SrcTy, 0);
1655 
1656     // Extract low 32-bits of the pointer.
1657     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1658 
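    // A null flat pointer maps to the segment null value; any other pointer
    // just keeps its low 32 bits.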
1659     auto CmpRes =
1660         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1661     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1662 
1663     MI.eraseFromParent();
1664     return true;
1665   }
1666 
1667   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1668     return false;
1669 
1670   if (!ST.hasFlatAddressSpace())
1671     return false;
1672 
1673   auto SegmentNull =
1674       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1675   auto FlatNull =
1676       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1677 
1678   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1679   if (!ApertureReg.isValid())
1680     return false;
1681 
1682   auto CmpRes =
1683       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1684 
1685   // Coerce the type of the low half of the result so we can use merge_values.
1686   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1687 
1688   // TODO: Should we allow mismatched types but matching sizes in merges to
1689   // avoid the ptrtoint?
1690   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1691   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1692 
1693   MI.eraseFromParent();
1694   return true;
1695 }
1696 
1697 bool AMDGPULegalizerInfo::legalizeFrint(
1698   MachineInstr &MI, MachineRegisterInfo &MRI,
1699   MachineIRBuilder &B) const {
1700   Register Src = MI.getOperand(1).getReg();
1701   LLT Ty = MRI.getType(Src);
1702   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1703 
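  // Round to nearest integer with the magic-number trick: adding and then
  // subtracting 2^52 (with the sign of the source) shifts the fraction bits
  // out of the significand. Inputs with |x| > 0x1.fffffffffffffp+51 are
  // already integers, so the final select passes them through unchanged.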
1704   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1705   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1706 
1707   auto C1 = B.buildFConstant(Ty, C1Val);
1708   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1709 
1710   // TODO: Should this propagate fast-math-flags?
1711   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1712   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1713 
1714   auto C2 = B.buildFConstant(Ty, C2Val);
1715   auto Fabs = B.buildFAbs(Ty, Src);
1716 
1717   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1718   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1720 }
1721 
1722 bool AMDGPULegalizerInfo::legalizeFceil(
1723   MachineInstr &MI, MachineRegisterInfo &MRI,
1724   MachineIRBuilder &B) const {
1725 
1726   const LLT S1 = LLT::scalar(1);
1727   const LLT S64 = LLT::scalar(64);
1728 
1729   Register Src = MI.getOperand(1).getReg();
1730   assert(MRI.getType(Src) == S64);
1731 
1732   // result = trunc(src)
1733   // if (src > 0.0 && src != result)
1734   //   result += 1.0
1735 
1736   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1737 
1738   const auto Zero = B.buildFConstant(S64, 0.0);
1739   const auto One = B.buildFConstant(S64, 1.0);
1740   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1741   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1742   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1743   auto Add = B.buildSelect(S64, And, One, Zero);
1744 
1745   // TODO: Should this propagate fast-math-flags?
1746   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1748 }
1749 
1750 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1751                                               MachineIRBuilder &B) {
1752   const unsigned FractBits = 52;
1753   const unsigned ExpBits = 11;
1754   LLT S32 = LLT::scalar(32);
1755 
1756   auto Const0 = B.buildConstant(S32, FractBits - 32);
1757   auto Const1 = B.buildConstant(S32, ExpBits);
1758 
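  // Bitfield-extract the 11-bit biased exponent from bits [30:20] of the high
  // word, then subtract the exponent bias (1023).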
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1762 
1763   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1764 }
1765 
1766 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1767   MachineInstr &MI, MachineRegisterInfo &MRI,
1768   MachineIRBuilder &B) const {
1769   const LLT S1 = LLT::scalar(1);
1770   const LLT S32 = LLT::scalar(32);
1771   const LLT S64 = LLT::scalar(64);
1772 
1773   Register Src = MI.getOperand(1).getReg();
1774   assert(MRI.getType(Src) == S64);
1775 
1776   // TODO: Should this use extract since the low half is unused?
1777   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1778   Register Hi = Unmerge.getReg(1);
1779 
1780   // Extract the upper half, since this is where we will find the sign and
1781   // exponent.
1782   auto Exp = extractF64Exponent(Hi, B);
1783 
1784   const unsigned FractBits = 52;
1785 
1786   // Extract the sign bit.
1787   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1788   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1789 
1790   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1791 
1792   const auto Zero32 = B.buildConstant(S32, 0);
1793 
1794   // Extend back to 64-bits.
1795   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1796 
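  // Clear the fraction bits that lie below the exponent: a negative exponent
  // truncates to a signed zero, and an exponent above 51 means the input is
  // already an integer.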
1797   auto Shr = B.buildAShr(S64, FractMask, Exp);
1798   auto Not = B.buildNot(S64, Shr);
1799   auto Tmp0 = B.buildAnd(S64, Src, Not);
1800   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1801 
1802   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1803   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1804 
1805   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1806   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1808 }
1809 
1810 bool AMDGPULegalizerInfo::legalizeITOFP(
1811   MachineInstr &MI, MachineRegisterInfo &MRI,
1812   MachineIRBuilder &B, bool Signed) const {
1813 
1814   Register Dst = MI.getOperand(0).getReg();
1815   Register Src = MI.getOperand(1).getReg();
1816 
1817   const LLT S64 = LLT::scalar(64);
1818   const LLT S32 = LLT::scalar(32);
1819 
1820   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1821 
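  // Convert the two 32-bit halves separately and recombine:
  //   result = (fp)hi * 2^32 + (fp)lo
  // where the high half conversion respects the signedness and the scaling by
  // 2^32 is done with ldexp.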
1822   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1823 
1824   auto CvtHi = Signed ?
1825     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1826     B.buildUITOFP(S64, Unmerge.getReg(1));
1827 
1828   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1829 
1830   auto ThirtyTwo = B.buildConstant(S32, 32);
1831   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1832     .addUse(CvtHi.getReg(0))
1833     .addUse(ThirtyTwo.getReg(0));
1834 
1835   // TODO: Should this propagate fast-math-flags?
1836   B.buildFAdd(Dst, LdExp, CvtLo);
1837   MI.eraseFromParent();
1838   return true;
1839 }
1840 
1841 // TODO: Copied from DAG implementation. Verify logic and document how this
1842 // actually works.
1843 bool AMDGPULegalizerInfo::legalizeFPTOI(
1844   MachineInstr &MI, MachineRegisterInfo &MRI,
1845   MachineIRBuilder &B, bool Signed) const {
1846 
1847   Register Dst = MI.getOperand(0).getReg();
1848   Register Src = MI.getOperand(1).getReg();
1849 
1850   const LLT S64 = LLT::scalar(64);
1851   const LLT S32 = LLT::scalar(32);
1852 
1853   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1854 
1855   unsigned Flags = MI.getFlags();
1856 
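  // Split the truncated value into 32-bit halves: K0 = 2^-32 scales out the
  // high word, and the fma with K1 = -2^32 recovers the low word as the
  // remainder.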
1857   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1858   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1859   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1860 
1861   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1862   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1863   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1864 
1865   auto Hi = Signed ?
1866     B.buildFPTOSI(S32, FloorMul) :
1867     B.buildFPTOUI(S32, FloorMul);
1868   auto Lo = B.buildFPTOUI(S32, Fma);
1869 
1870   B.buildMerge(Dst, { Lo, Hi });
1871   MI.eraseFromParent();
1872 
1873   return true;
1874 }
1875 
1876 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1877                                                MachineInstr &MI) const {
1878   MachineFunction &MF = Helper.MIRBuilder.getMF();
1879   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1880 
1881   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1882                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1883 
1884   // With ieee_mode disabled, the instructions have the correct behavior
1885   // already for G_FMINNUM/G_FMAXNUM
1886   if (!MFI->getMode().IEEE)
1887     return !IsIEEEOp;
1888 
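  // In IEEE mode the *_IEEE variants already have the desired behavior, while
  // the non-IEEE variants need the generic expansion.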
1889   if (IsIEEEOp)
1890     return true;
1891 
1892   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1893 }
1894 
1895 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1896   MachineInstr &MI, MachineRegisterInfo &MRI,
1897   MachineIRBuilder &B) const {
1898   // TODO: Should move some of this into LegalizerHelper.
1899 
1900   // TODO: Promote dynamic indexing of s16 to s32
1901 
1902   // FIXME: Artifact combiner probably should have replaced the truncated
1903   // constant before this, so we shouldn't need
1904   // getConstantVRegValWithLookThrough.
1905   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1906     MI.getOperand(2).getReg(), MRI);
1907   if (!IdxVal) // Dynamic case will be selected to register indexing.
1908     return true;
1909 
1910   Register Dst = MI.getOperand(0).getReg();
1911   Register Vec = MI.getOperand(1).getReg();
1912 
1913   LLT VecTy = MRI.getType(Vec);
1914   LLT EltTy = VecTy.getElementType();
1915   assert(EltTy == MRI.getType(Dst));
1916 
1917   if (IdxVal->Value < VecTy.getNumElements())
1918     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1919   else
1920     B.buildUndef(Dst);
1921 
1922   MI.eraseFromParent();
1923   return true;
1924 }
1925 
1926 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1927   MachineInstr &MI, MachineRegisterInfo &MRI,
1928   MachineIRBuilder &B) const {
1929   // TODO: Should move some of this into LegalizerHelper.
1930 
1931   // TODO: Promote dynamic indexing of s16 to s32
1932 
1933   // FIXME: Artifact combiner probably should have replaced the truncated
1934   // constant before this, so we shouldn't need
1935   // getConstantVRegValWithLookThrough.
1936   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1937     MI.getOperand(3).getReg(), MRI);
1938   if (!IdxVal) // Dynamic case will be selected to register indexing.
1939     return true;
1940 
1941   Register Dst = MI.getOperand(0).getReg();
1942   Register Vec = MI.getOperand(1).getReg();
1943   Register Ins = MI.getOperand(2).getReg();
1944 
1945   LLT VecTy = MRI.getType(Vec);
1946   LLT EltTy = VecTy.getElementType();
1947   assert(EltTy == MRI.getType(Ins));
1948 
1949   if (IdxVal->Value < VecTy.getNumElements())
1950     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1951   else
1952     B.buildUndef(Dst);
1953 
1954   MI.eraseFromParent();
1955   return true;
1956 }
1957 
1958 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1959   MachineInstr &MI, MachineRegisterInfo &MRI,
1960   MachineIRBuilder &B) const {
1961   const LLT V2S16 = LLT::vector(2, 16);
1962 
1963   Register Dst = MI.getOperand(0).getReg();
1964   Register Src0 = MI.getOperand(1).getReg();
1965   LLT DstTy = MRI.getType(Dst);
1966   LLT SrcTy = MRI.getType(Src0);
1967 
1968   if (SrcTy == V2S16 && DstTy == V2S16 &&
1969       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1970     return true;
1971 
1972   MachineIRBuilder HelperBuilder(MI);
1973   GISelObserverWrapper DummyObserver;
1974   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1975   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1976 }
1977 
1978 bool AMDGPULegalizerInfo::legalizeSinCos(
1979   MachineInstr &MI, MachineRegisterInfo &MRI,
1980   MachineIRBuilder &B) const {
1981 
1982   Register DstReg = MI.getOperand(0).getReg();
1983   Register SrcReg = MI.getOperand(1).getReg();
1984   LLT Ty = MRI.getType(DstReg);
1985   unsigned Flags = MI.getFlags();
1986 
1987   Register TrigVal;
1988   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1989   if (ST.hasTrigReducedRange()) {
1990     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1991     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1992       .addUse(MulVal.getReg(0))
1993       .setMIFlags(Flags).getReg(0);
1994   } else
1995     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1996 
1997   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1998     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1999   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2000     .addUse(TrigVal)
2001     .setMIFlags(Flags);
2002   MI.eraseFromParent();
2003   return true;
2004 }
2005 
2006 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2007                                                   MachineIRBuilder &B,
2008                                                   const GlobalValue *GV,
2009                                                   int64_t Offset,
2010                                                   unsigned GAFlags) const {
2011   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2012   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2013   // to the following code sequence:
2014   //
2015   // For constant address space:
2016   //   s_getpc_b64 s[0:1]
2017   //   s_add_u32 s0, s0, $symbol
2018   //   s_addc_u32 s1, s1, 0
2019   //
2020   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2021   //   a fixup or relocation is emitted to replace $symbol with a literal
2022   //   constant, which is a pc-relative offset from the encoding of the $symbol
2023   //   operand to the global variable.
2024   //
2025   // For global address space:
2026   //   s_getpc_b64 s[0:1]
2027   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2028   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2029   //
2030   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2031   //   fixups or relocations are emitted to replace $symbol@*@lo and
2032   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2033   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2034   //   operand to the global variable.
2035   //
2036   // What we want here is an offset from the value returned by s_getpc
2037   // (which is the address of the s_add_u32 instruction) to the global
2038   // variable, but since the encoding of $symbol starts 4 bytes after the start
2039   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2040   // small. This requires us to add 4 to the global variable offset in order to
2041   // compute the correct address.
2042 
2043   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2044 
2045   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2046     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2047 
2048   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2049     .addDef(PCReg);
2050 
2051   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2052   if (GAFlags == SIInstrInfo::MO_NONE)
2053     MIB.addImm(0);
2054   else
2055     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2056 
2057   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2058 
2059   if (PtrTy.getSizeInBits() == 32)
2060     B.buildExtract(DstReg, PCReg, 0);
2061   return true;
}
2063 
2064 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2065   MachineInstr &MI, MachineRegisterInfo &MRI,
2066   MachineIRBuilder &B) const {
2067   Register DstReg = MI.getOperand(0).getReg();
2068   LLT Ty = MRI.getType(DstReg);
2069   unsigned AS = Ty.getAddressSpace();
2070 
2071   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2072   MachineFunction &MF = B.getMF();
2073   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2074 
2075   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2076     if (!MFI->isEntryFunction()) {
2077       const Function &Fn = MF.getFunction();
2078       DiagnosticInfoUnsupported BadLDSDecl(
2079         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2080         DS_Warning);
2081       Fn.getContext().diagnose(BadLDSDecl);
2082 
2083       // We currently don't have a way to correctly allocate LDS objects that
2084       // aren't directly associated with a kernel. We do force inlining of
2085       // functions that use local objects. However, if these dead functions are
2086       // not eliminated, we don't want a compile time error. Just emit a warning
2087       // and a trap, since there should be no callable path here.
2088       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2089       B.buildUndef(DstReg);
2090       MI.eraseFromParent();
2091       return true;
2092     }
2093 
2094     // TODO: We could emit code to handle the initialization somewhere.
2095     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2096       const SITargetLowering *TLI = ST.getTargetLowering();
2097       if (!TLI->shouldUseLDSConstAddress(GV)) {
2098         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2099         return true; // Leave in place;
2100       }
2101 
2102       B.buildConstant(
2103           DstReg,
2104           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2105       MI.eraseFromParent();
2106       return true;
2107     }
2108 
2109     const Function &Fn = MF.getFunction();
2110     DiagnosticInfoUnsupported BadInit(
2111       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2112     Fn.getContext().diagnose(BadInit);
2113     return true;
2114   }
2115 
2116   const SITargetLowering *TLI = ST.getTargetLowering();
2117 
2118   if (TLI->shouldEmitFixup(GV)) {
2119     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2120     MI.eraseFromParent();
2121     return true;
2122   }
2123 
2124   if (TLI->shouldEmitPCReloc(GV)) {
2125     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2126     MI.eraseFromParent();
2127     return true;
2128   }
2129 
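  // Otherwise go through the GOT: materialize the address of the GOT slot
  // pc-relatively and load the actual pointer from constant address space.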
2130   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2131   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2132 
2133   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2134       MachinePointerInfo::getGOT(MF),
2135       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2136           MachineMemOperand::MOInvariant,
2137       8 /*Size*/, Align(8));
2138 
2139   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2140 
2141   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2143     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2144     B.buildExtract(DstReg, Load, 0);
2145   } else
2146     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2147 
2148   MI.eraseFromParent();
2149   return true;
2150 }
2151 
2152 bool AMDGPULegalizerInfo::legalizeLoad(
2153   MachineInstr &MI, MachineRegisterInfo &MRI,
2154   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2155   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2156   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2157   Observer.changingInstr(MI);
2158   MI.getOperand(1).setReg(Cast.getReg(0));
2159   Observer.changedInstr(MI);
2160   return true;
2161 }
2162 
2163 bool AMDGPULegalizerInfo::legalizeFMad(
2164   MachineInstr &MI, MachineRegisterInfo &MRI,
2165   MachineIRBuilder &B) const {
2166   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2167   assert(Ty.isScalar());
2168 
2169   MachineFunction &MF = B.getMF();
2170   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2171 
2172   // TODO: Always legal with future ftz flag.
2173   // FIXME: Do we need just output?
2174   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2175     return true;
2176   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2177     return true;
2178 
2179   MachineIRBuilder HelperBuilder(MI);
2180   GISelObserverWrapper DummyObserver;
2181   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2182   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2183 }
2184 
2185 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2186   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2187   Register DstReg = MI.getOperand(0).getReg();
2188   Register PtrReg = MI.getOperand(1).getReg();
2189   Register CmpVal = MI.getOperand(2).getReg();
2190   Register NewVal = MI.getOperand(3).getReg();
2191 
2192   assert(SITargetLowering::isFlatGlobalAddrSpace(
2193            MRI.getType(PtrReg).getAddressSpace()) &&
2194          "this should not have been custom lowered");
2195 
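  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
  // together as a two-element vector operand.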
2196   LLT ValTy = MRI.getType(CmpVal);
2197   LLT VecTy = LLT::vector(2, ValTy);
2198 
2199   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2200 
2201   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2202     .addDef(DstReg)
2203     .addUse(PtrReg)
2204     .addUse(PackedVal)
2205     .setMemRefs(MI.memoperands());
2206 
2207   MI.eraseFromParent();
2208   return true;
2209 }
2210 
2211 bool AMDGPULegalizerInfo::legalizeFlog(
2212   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2213   Register Dst = MI.getOperand(0).getReg();
2214   Register Src = MI.getOperand(1).getReg();
2215   LLT Ty = B.getMRI()->getType(Dst);
2216   unsigned Flags = MI.getFlags();
2217 
2218   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2219   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2220 
2221   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2222   MI.eraseFromParent();
2223   return true;
2224 }
2225 
2226 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2227                                        MachineIRBuilder &B) const {
2228   Register Dst = MI.getOperand(0).getReg();
2229   Register Src = MI.getOperand(1).getReg();
2230   unsigned Flags = MI.getFlags();
2231   LLT Ty = B.getMRI()->getType(Dst);
2232 
2233   auto K = B.buildFConstant(Ty, numbers::log2e);
2234   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2235   B.buildFExp2(Dst, Mul, Flags);
2236   MI.eraseFromParent();
2237   return true;
2238 }
2239 
2240 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2241                                        MachineIRBuilder &B) const {
2242   Register Dst = MI.getOperand(0).getReg();
2243   Register Src0 = MI.getOperand(1).getReg();
2244   Register Src1 = MI.getOperand(2).getReg();
2245   unsigned Flags = MI.getFlags();
2246   LLT Ty = B.getMRI()->getType(Dst);
2247   const LLT S16 = LLT::scalar(16);
2248   const LLT S32 = LLT::scalar(32);
2249 
2250   if (Ty == S32) {
2251     auto Log = B.buildFLog2(S32, Src0, Flags);
2252     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2253       .addUse(Log.getReg(0))
2254       .addUse(Src1)
2255       .setMIFlags(Flags);
2256     B.buildFExp2(Dst, Mul, Flags);
2257   } else if (Ty == S16) {
2258     // There's no f16 fmul_legacy, so we need to convert for it.
2259     auto Log = B.buildFLog2(S16, Src0, Flags);
2260     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2261     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2262     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2263       .addUse(Ext0.getReg(0))
2264       .addUse(Ext1.getReg(0))
2265       .setMIFlags(Flags);
2266 
2267     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2268   } else
2269     return false;
2270 
2271   MI.eraseFromParent();
2272   return true;
2273 }
2274 
2275 // Find a source register, ignoring any possible source modifiers.
2276 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2277   Register ModSrc = OrigSrc;
2278   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2279     ModSrc = SrcFNeg->getOperand(1).getReg();
2280     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2281       ModSrc = SrcFAbs->getOperand(1).getReg();
2282   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2283     ModSrc = SrcFAbs->getOperand(1).getReg();
2284   return ModSrc;
2285 }
2286 
2287 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2288                                          MachineRegisterInfo &MRI,
2289                                          MachineIRBuilder &B) const {
2290 
2291   const LLT S1 = LLT::scalar(1);
2292   const LLT S64 = LLT::scalar(64);
2293   Register Dst = MI.getOperand(0).getReg();
2294   Register OrigSrc = MI.getOperand(1).getReg();
2295   unsigned Flags = MI.getFlags();
2296   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2297          "this should not have been custom lowered");
2298 
2299   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2300   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2301   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2302   // V_FRACT bug is:
2303   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2304   //
2305   // Convert floor(x) to (x - fract(x))
2306 
2307   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2308     .addUse(OrigSrc)
2309     .setMIFlags(Flags);
2310 
2311   // Give source modifier matching some assistance before obscuring a foldable
2312   // pattern.
2313 
2314   // TODO: We can avoid the neg on the fract? The input sign to fract
2315   // shouldn't matter?
2316   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2317 
2318   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2319 
2320   Register Min = MRI.createGenericVirtualRegister(S64);
2321 
2322   // We don't need to concern ourselves with the snan handling difference, so
2323   // use the one which will directly select.
2324   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2325   if (MFI->getMode().IEEE)
2326     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2327   else
2328     B.buildFMinNum(Min, Fract, Const, Flags);
2329 
2330   Register CorrectedFract = Min;
2331   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2332     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2333     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2334   }
2335 
2336   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2337   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2338 
2339   MI.eraseFromParent();
2340   return true;
2341 }
2342 
2343 // Turn an illegal packed v2s16 build vector into bit operations.
2344 // TODO: This should probably be a bitcast action in LegalizerHelper.
2345 bool AMDGPULegalizerInfo::legalizeBuildVector(
2346   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2347   Register Dst = MI.getOperand(0).getReg();
2348   const LLT S32 = LLT::scalar(32);
2349   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2350 
2351   Register Src0 = MI.getOperand(1).getReg();
2352   Register Src1 = MI.getOperand(2).getReg();
2353   assert(MRI.getType(Src0) == LLT::scalar(16));
2354 
2355   auto Merge = B.buildMerge(S32, {Src0, Src1});
2356   B.buildBitcast(Dst, Merge);
2357 
2358   MI.eraseFromParent();
2359   return true;
2360 }
2361 
2362 // Return the use branch instruction, otherwise null if the usage is invalid.
2363 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2364                                        MachineRegisterInfo &MRI,
2365                                        MachineInstr *&Br,
2366                                        MachineBasicBlock *&UncondBrTarget) {
2367   Register CondDef = MI.getOperand(0).getReg();
2368   if (!MRI.hasOneNonDBGUse(CondDef))
2369     return nullptr;
2370 
2371   MachineBasicBlock *Parent = MI.getParent();
2372   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2373   if (UseMI.getParent() != Parent ||
2374       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2375     return nullptr;
2376 
2377   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2378   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2379   if (Next == Parent->end()) {
2380     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2381     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2382       return nullptr;
2383     UncondBrTarget = &*NextMBB;
2384   } else {
2385     if (Next->getOpcode() != AMDGPU::G_BR)
2386       return nullptr;
2387     Br = &*Next;
2388     UncondBrTarget = Br->getOperand(0).getMBB();
2389   }
2390 
2391   return &UseMI;
2392 }
2393 
2394 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2395                                                MachineRegisterInfo &MRI,
2396                                                Register LiveIn,
2397                                                Register PhyReg) const {
2398   assert(PhyReg.isPhysical() && "Physical register expected");
2399 
2400   // Insert the live-in copy, if required, by defining destination virtual
2401   // register.
2402   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2403   if (!MRI.getVRegDef(LiveIn)) {
2404     // FIXME: Should have scoped insert pt
2405     MachineBasicBlock &OrigInsBB = B.getMBB();
2406     auto OrigInsPt = B.getInsertPt();
2407 
2408     MachineBasicBlock &EntryMBB = B.getMF().front();
2409     EntryMBB.addLiveIn(PhyReg);
2410     B.setInsertPt(EntryMBB, EntryMBB.begin());
2411     B.buildCopy(LiveIn, PhyReg);
2412 
2413     B.setInsertPt(OrigInsBB, OrigInsPt);
2414   }
2415 
2416   return LiveIn;
2417 }
2418 
2419 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2420                                                 MachineRegisterInfo &MRI,
2421                                                 Register PhyReg, LLT Ty,
2422                                                 bool InsertLiveInCopy) const {
2423   assert(PhyReg.isPhysical() && "Physical register expected");
2424 
  // Get or create virtual live-in register.
2426   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2427   if (!LiveIn) {
2428     LiveIn = MRI.createGenericVirtualRegister(Ty);
2429     MRI.addLiveIn(PhyReg, LiveIn);
2430   }
2431 
  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), live-in copy insertion from physical
  // register to virtual register is not required.
2435   if (!InsertLiveInCopy)
2436     return LiveIn;
2437 
2438   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2439 }
2440 
2441 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2442     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2443   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2444   const ArgDescriptor *Arg;
2445   const TargetRegisterClass *RC;
2446   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2447   if (!Arg) {
2448     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2449     return nullptr;
2450   }
2451   return Arg;
2452 }
2453 
2454 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2455                                          const ArgDescriptor *Arg) const {
2456   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2457     return false; // TODO: Handle these
2458 
2459   Register SrcReg = Arg->getRegister();
2460   assert(SrcReg.isPhysical() && "Physical register expected");
2461   assert(DstReg.isVirtual() && "Virtual register expected");
2462 
2463   MachineRegisterInfo &MRI = *B.getMRI();
2464 
2465   LLT Ty = MRI.getType(DstReg);
2466   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2467 
2468   if (Arg->isMasked()) {
2469     // TODO: Should we try to emit this once in the entry block?
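    // Masked arguments pack several values into a single register; shift the
    // field down to bit 0 and mask off the rest to extract this one.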
2470     const LLT S32 = LLT::scalar(32);
2471     const unsigned Mask = Arg->getMask();
2472     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2473 
2474     Register AndMaskSrc = LiveIn;
2475 
2476     if (Shift != 0) {
2477       auto ShiftAmt = B.buildConstant(S32, Shift);
2478       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2479     }
2480 
2481     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2482   } else {
2483     B.buildCopy(DstReg, LiveIn);
2484   }
2485 
2486   return true;
2487 }
2488 
2489 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2490     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2491     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2492 
2493   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2494   if (!Arg)
2495     return false;
2496 
2497   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2498     return false;
2499 
2500   MI.eraseFromParent();
2501   return true;
2502 }
2503 
2504 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2505                                        MachineRegisterInfo &MRI,
2506                                        MachineIRBuilder &B) const {
2507   Register Dst = MI.getOperand(0).getReg();
2508   LLT DstTy = MRI.getType(Dst);
2509   LLT S16 = LLT::scalar(16);
2510   LLT S32 = LLT::scalar(32);
2511   LLT S64 = LLT::scalar(64);
2512 
2513   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2514     return true;
2515 
2516   if (DstTy == S16)
2517     return legalizeFDIV16(MI, MRI, B);
2518   if (DstTy == S32)
2519     return legalizeFDIV32(MI, MRI, B);
2520   if (DstTy == S64)
2521     return legalizeFDIV64(MI, MRI, B);
2522 
2523   return false;
2524 }
2525 
2526 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2527   const LLT S32 = LLT::scalar(32);
2528 
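  // Approximate 2^32 / Src: convert to float, take the hardware reciprocal,
  // and scale by 2^32 (0x4f800000 == 4294967296.0f) before converting back.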
2529   auto Cvt0 = B.buildUITOFP(S32, Src);
2530   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2531   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2532   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2533   return B.buildFPTOUI(S32, Mul).getReg(0);
2534 }
2535 
2536 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2537                                                   Register DstReg,
2538                                                   Register Num,
2539                                                   Register Den,
2540                                                   bool IsDiv) const {
2541   const LLT S1 = LLT::scalar(1);
2542   const LLT S32 = LLT::scalar(32);
2543 
2544   // RCP =  URECIP(Den) = 2^32 / Den + e
2545   // e is rounding error.
2546   auto RCP = buildDivRCP(B, Den);
2547 
2548   // RCP_LO = mul(RCP, Den)
2549   auto RCP_LO = B.buildMul(S32, RCP, Den);
2550 
  // RCP_HI = mulhu(RCP, Den)
2552   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2553 
2554   // NEG_RCP_LO = -RCP_LO
2555   auto Zero = B.buildConstant(S32, 0);
2556   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2557 
2558   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2559   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2560   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2561 
2562   // Calculate the rounding error from the URECIP instruction
2563   // E = mulhu(ABS_RCP_LO, RCP)
2564   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2565 
2566   // RCP_A_E = RCP + E
2567   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2568 
2569   // RCP_S_E = RCP - E
2570   auto RCP_S_E = B.buildSub(S32, RCP, E);
2571 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2573   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2574 
  // Quotient = mulhu(Tmp0, Num)
2576   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2577 
2578   // Num_S_Remainder = Quotient * Den
2579   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2580 
2581   // Remainder = Num - Num_S_Remainder
2582   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2583 
2584   // Remainder_GE_Den = Remainder >= Den
2585   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2586 
2587   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2588   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2589                                        Num, Num_S_Remainder);
2590 
2591   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2592   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2593 
2594   // Calculate Division result:
2595 
2596   // Quotient_A_One = Quotient + 1
2597   auto One = B.buildConstant(S32, 1);
2598   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2599 
2600   // Quotient_S_One = Quotient - 1
2601   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2602 
2603   // Div = (Tmp1 ? Quotient_A_One : Quotient)
2604   auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);
2605 
2606   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2607   if (IsDiv) {
2608     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2609   } else {
2610     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2611 
2612     // Calculate Rem result:
2613     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2614 
2615     // Remainder_A_Den = Remainder + Den
2616     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2617 
2618     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2619     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2620 
2621     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2622     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2623   }
2624 }
2625 
2626 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2627                                               MachineRegisterInfo &MRI,
2628                                               MachineIRBuilder &B) const {
2629   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2630   Register DstReg = MI.getOperand(0).getReg();
2631   Register Num = MI.getOperand(1).getReg();
2632   Register Den = MI.getOperand(2).getReg();
2633   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2634   MI.eraseFromParent();
2635   return true;
2636 }
2637 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2639 //
2640 // Return lo, hi of result
2641 //
2642 // %cvt.lo = G_UITOFP Val.lo
2643 // %cvt.hi = G_UITOFP Val.hi
2644 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2645 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2646 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2647 // %mul2 = G_FMUL %mul1, 2**(-32)
2648 // %trunc = G_INTRINSIC_TRUNC %mul2
2649 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2650 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2651 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2652                                                        Register Val) {
2653   const LLT S32 = LLT::scalar(32);
2654   auto Unmerge = B.buildUnmerge(S32, Val);
2655 
2656   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2657   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2658 
2659   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2660                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2661 
2662   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2663   auto Mul1 =
2664       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2665 
2666   // 2**(-32)
2667   auto Mul2 =
2668       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2669   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2670 
2671   // -(2**32)
2672   auto Mad2 = B.buildFMAD(S32, Trunc,
2673                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2674 
2675   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2676   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2677 
2678   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2679 }
2680 
2681 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2682                                                   Register DstReg,
2683                                                   Register Numer,
2684                                                   Register Denom,
2685                                                   bool IsDiv) const {
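  // 64-bit expansion: start from an approximate reciprocal of the denominator,
  // refine it with two correction steps (Add1, Add2), form the quotient
  // estimate with a 64-bit mulhi, then conditionally adjust the quotient and
  // remainder by up to two more subtractions of the denominator.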
2686   const LLT S32 = LLT::scalar(32);
2687   const LLT S64 = LLT::scalar(64);
2688   const LLT S1 = LLT::scalar(1);
2689   Register RcpLo, RcpHi;
2690 
2691   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2692 
2693   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2694 
2695   auto Zero64 = B.buildConstant(S64, 0);
2696   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2697 
2698   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2699   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2700 
2701   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2702   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2703   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2704 
2705   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2706   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2707   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2708   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2709 
2710   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2711   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2712   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2713   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2714   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2715 
2716   auto Zero32 = B.buildConstant(S32, 0);
2717   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2718   auto Add2_HiC =
2719       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2720   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2721   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2722 
2723   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2724   Register NumerLo = UnmergeNumer.getReg(0);
2725   Register NumerHi = UnmergeNumer.getReg(1);
2726 
2727   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2728   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2729   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2730   Register Mul3_Lo = UnmergeMul3.getReg(0);
2731   Register Mul3_Hi = UnmergeMul3.getReg(1);
2732   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2733   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2734   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2735   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2736 
2737   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2738   Register DenomLo = UnmergeDenom.getReg(0);
2739   Register DenomHi = UnmergeDenom.getReg(1);
2740 
2741   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2742   auto C1 = B.buildSExt(S32, CmpHi);
2743 
2744   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2745   auto C2 = B.buildSExt(S32, CmpLo);
2746 
2747   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2748   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2749 
2750   // TODO: Here and below portions of the code can be enclosed into if/endif.
2751   // Currently control flow is unconditional and we have 4 selects after
2752   // potential endif to substitute PHIs.
2753 
2754   // if C3 != 0 ...
2755   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2756   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2757   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2758   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2759 
2760   auto One64 = B.buildConstant(S64, 1);
2761   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2762 
2763   auto C4 =
2764       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2765   auto C5 =
2766       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2767   auto C6 = B.buildSelect(
2768       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2769 
2770   // if (C6 != 0)
2771   auto Add4 = B.buildAdd(S64, Add3, One64);
2772   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2773 
2774   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2775   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2776   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2777 
2778   // endif C6
2779   // endif C3
2780 
2781   if (IsDiv) {
2782     auto Sel1 = B.buildSelect(
2783         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2784     B.buildSelect(DstReg,
2785                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2786   } else {
2787     auto Sel2 = B.buildSelect(
2788         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2789     B.buildSelect(DstReg,
2790                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2791   }
2792 }
2793 
2794 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2795                                             MachineRegisterInfo &MRI,
2796                                             MachineIRBuilder &B) const {
2797   const LLT S64 = LLT::scalar(64);
2798   const LLT S32 = LLT::scalar(32);
2799   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2800   Register DstReg = MI.getOperand(0).getReg();
2801   Register Num = MI.getOperand(1).getReg();
2802   Register Den = MI.getOperand(2).getReg();
2803   LLT Ty = MRI.getType(DstReg);
2804 
2805   if (Ty == S32)
2806     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2807   else if (Ty == S64)
2808     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2809   else
2810     return false;
2811 
2812   MI.eraseFromParent();
2813   return true;
}
2816 
2817 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2818                                             MachineRegisterInfo &MRI,
2819                                             MachineIRBuilder &B) const {
2820   const LLT S64 = LLT::scalar(64);
2821   const LLT S32 = LLT::scalar(32);
2822 
2823   Register DstReg = MI.getOperand(0).getReg();
2824   const LLT Ty = MRI.getType(DstReg);
2825   if (Ty != S32 && Ty != S64)
2826     return false;
2827 
2828   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2829 
2830   Register LHS = MI.getOperand(1).getReg();
2831   Register RHS = MI.getOperand(2).getReg();
2832 
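  // Compute |LHS| and |RHS| via the (x + sign) ^ sign identity, do the
  // unsigned division, then restore the sign: the quotient is negative when
  // the operand signs differ, and the remainder takes the sign of LHS.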
2833   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2834   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2835   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2836 
2837   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2838   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2839 
2840   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2841   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2842 
2843   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2844   if (Ty == S32)
2845     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2846   else
2847     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2848 
2849   Register Sign;
2850   if (IsDiv)
2851     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2852   else
2853     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2854 
2855   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2856   B.buildSub(DstReg, UDivRem, Sign);
2857 
2858   MI.eraseFromParent();
2859   return true;
2860 }
2861 
2862 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2863                                                  MachineRegisterInfo &MRI,
2864                                                  MachineIRBuilder &B) const {
2865   Register Res = MI.getOperand(0).getReg();
2866   Register LHS = MI.getOperand(1).getReg();
2867   Register RHS = MI.getOperand(2).getReg();
2868 
2869   uint16_t Flags = MI.getFlags();
2870 
2871   LLT ResTy = MRI.getType(Res);
2872   LLT S32 = LLT::scalar(32);
2873   LLT S64 = LLT::scalar(64);
2874 
2875   const MachineFunction &MF = B.getMF();
2876   bool Unsafe =
2877     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2878 
2879   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2880     return false;
2881 
2882   if (!Unsafe && ResTy == S32 &&
2883       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2884     return false;
2885 
2886   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2887     // 1 / x -> RCP(x)
2888     if (CLHS->isExactlyValue(1.0)) {
2889       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2890         .addUse(RHS)
2891         .setMIFlags(Flags);
2892 
2893       MI.eraseFromParent();
2894       return true;
2895     }
2896 
2897     // -1 / x -> RCP( FNEG(x) )
2898     if (CLHS->isExactlyValue(-1.0)) {
2899       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2900       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2901         .addUse(FNeg.getReg(0))
2902         .setMIFlags(Flags);
2903 
2904       MI.eraseFromParent();
2905       return true;
2906     }
2907   }
2908 
2909   // x / y -> x * (1.0 / y)
2910   if (Unsafe) {
2911     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2912       .addUse(RHS)
2913       .setMIFlags(Flags);
2914     B.buildFMul(Res, LHS, RCP, Flags);
2915 
2916     MI.eraseFromParent();
2917     return true;
2918   }
2919 
2920   return false;
2921 }
2922 
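// f16 division is performed in f32: extend both operands, approximate the
// quotient as LHS * rcp(RHS), truncate back to f16, and rely on div_fixup to
// handle special cases such as infinities, NaNs and signed zeros.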
2923 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2924                                          MachineRegisterInfo &MRI,
2925                                          MachineIRBuilder &B) const {
2926   Register Res = MI.getOperand(0).getReg();
2927   Register LHS = MI.getOperand(1).getReg();
2928   Register RHS = MI.getOperand(2).getReg();
2929 
2930   uint16_t Flags = MI.getFlags();
2931 
2932   LLT S16 = LLT::scalar(16);
2933   LLT S32 = LLT::scalar(32);
2934 
2935   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2936   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2937 
2938   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2939     .addUse(RHSExt.getReg(0))
2940     .setMIFlags(Flags);
2941 
2942   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2943   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2944 
2945   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2946     .addUse(RDst.getReg(0))
2947     .addUse(RHS)
2948     .addUse(LHS)
2949     .setMIFlags(Flags);
2950 
2951   MI.eraseFromParent();
2952   return true;
2953 }
2954 
2955 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2956 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2957 static void toggleSPDenormMode(bool Enable,
2958                                MachineIRBuilder &B,
2959                                const GCNSubtarget &ST,
2960                                AMDGPU::SIModeRegisterDefaults Mode) {
2961   // Set SP denorm mode to this value.
2962   unsigned SPDenormMode =
2963     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2964 
2965   if (ST.hasDenormModeInst()) {
2966     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2967     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2968 
2969     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2970     B.buildInstr(AMDGPU::S_DENORM_MODE)
2971       .addImm(NewDenormModeValue);
2972 
2973   } else {
2974     // Select FP32 bit field in mode register.
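    // This is hwreg(HW_REG_MODE, 4, 2): the two FP32 denorm control bits live
    // at offset 4 of the MODE register (width 2, hence WIDTH_M1 = 1).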
2975     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2976                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2977                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2978 
2979     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2980       .addImm(SPDenormMode)
2981       .addImm(SPDenormModeBitField);
2982   }
2983 }
2984 
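// f32 division uses the div_scale / div_fmas / div_fixup sequence: the
// operands are scaled into range, the reciprocal of the scaled denominator is
// refined with a chain of FMAs, and div_fmas / div_fixup apply the final scale
// and special-case handling. FP32 denormals are temporarily enabled around the
// FMA chain when the function's mode flushes them, since the intermediate
// values of the expansion may be denormal.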
2985 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2986                                          MachineRegisterInfo &MRI,
2987                                          MachineIRBuilder &B) const {
2988   Register Res = MI.getOperand(0).getReg();
2989   Register LHS = MI.getOperand(1).getReg();
2990   Register RHS = MI.getOperand(2).getReg();
2991   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2992   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2993 
2994   uint16_t Flags = MI.getFlags();
2995 
2996   LLT S32 = LLT::scalar(32);
2997   LLT S1 = LLT::scalar(1);
2998 
2999   auto One = B.buildFConstant(S32, 1.0f);
3000 
3001   auto DenominatorScaled =
3002     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3003       .addUse(LHS)
3004       .addUse(RHS)
3005       .addImm(0)
3006       .setMIFlags(Flags);
3007   auto NumeratorScaled =
3008     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3009       .addUse(LHS)
3010       .addUse(RHS)
3011       .addImm(1)
3012       .setMIFlags(Flags);
3013 
3014   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3015     .addUse(DenominatorScaled.getReg(0))
3016     .setMIFlags(Flags);
3017   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3018 
3019   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3020   // aren't modeled as reading it.
3021   if (!Mode.allFP32Denormals())
3022     toggleSPDenormMode(true, B, ST, Mode);
3023 
3024   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3025   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3026   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3027   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3028   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3029   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3030 
3031   if (!Mode.allFP32Denormals())
3032     toggleSPDenormMode(false, B, ST, Mode);
3033 
3034   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3035     .addUse(Fma4.getReg(0))
3036     .addUse(Fma1.getReg(0))
3037     .addUse(Fma3.getReg(0))
3038     .addUse(NumeratorScaled.getReg(1))
3039     .setMIFlags(Flags);
3040 
3041   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3042     .addUse(Fmas.getReg(0))
3043     .addUse(RHS)
3044     .addUse(LHS)
3045     .setMIFlags(Flags);
3046 
3047   MI.eraseFromParent();
3048   return true;
3049 }
3050 
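// f64 division follows the same div_scale + FMA refinement + div_fmas +
// div_fixup pattern as the f32 case, with no denorm mode switching. On
// subtargets where div_scale's condition output is unusable, the select
// condition is recomputed from the high dwords of the operands and of the
// div_scale results.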
3051 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3052                                          MachineRegisterInfo &MRI,
3053                                          MachineIRBuilder &B) const {
3054   Register Res = MI.getOperand(0).getReg();
3055   Register LHS = MI.getOperand(1).getReg();
3056   Register RHS = MI.getOperand(2).getReg();
3057 
3058   uint16_t Flags = MI.getFlags();
3059 
3060   LLT S64 = LLT::scalar(64);
3061   LLT S1 = LLT::scalar(1);
3062 
3063   auto One = B.buildFConstant(S64, 1.0);
3064 
3065   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3066     .addUse(LHS)
3067     .addUse(RHS)
3068     .addImm(0)
3069     .setMIFlags(Flags);
3070 
3071   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3072 
3073   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3074     .addUse(DivScale0.getReg(0))
3075     .setMIFlags(Flags);
3076 
3077   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3078   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3079   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3080 
3081   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3082     .addUse(LHS)
3083     .addUse(RHS)
3084     .addImm(1)
3085     .setMIFlags(Flags);
3086 
3087   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3088   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3089   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3090 
3091   Register Scale;
3092   if (!ST.hasUsableDivScaleConditionOutput()) {
3093     // Workaround a hardware bug on SI where the condition output from div_scale
3094     // is not usable.
3095 
3096     LLT S32 = LLT::scalar(32);
3097 
3098     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3099     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3100     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3101     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3102 
3103     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3104                               Scale1Unmerge.getReg(1));
3105     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3106                               Scale0Unmerge.getReg(1));
3107     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3108   } else {
3109     Scale = DivScale1.getReg(1);
3110   }
3111 
3112   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3113     .addUse(Fma4.getReg(0))
3114     .addUse(Fma3.getReg(0))
3115     .addUse(Mul.getReg(0))
3116     .addUse(Scale)
3117     .setMIFlags(Flags);
3118 
3119   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3120     .addUse(Fmas.getReg(0))
3121     .addUse(RHS)
3122     .addUse(LHS)
3123     .setMIFlags(Flags);
3124 
3125   MI.eraseFromParent();
3126   return true;
3127 }
3128 
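// amdgcn.fdiv.fast: guard against the reciprocal flushing to zero or a
// denormal for very large denominators by pre-scaling: with
// S = |RHS| > 0x1.0p+96 ? 0x1.0p-32 : 1.0, the result is computed as
// S * (LHS * rcp(RHS * S)).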
3129 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3130                                                  MachineRegisterInfo &MRI,
3131                                                  MachineIRBuilder &B) const {
3132   Register Res = MI.getOperand(0).getReg();
3133   Register LHS = MI.getOperand(2).getReg();
3134   Register RHS = MI.getOperand(3).getReg();
3135   uint16_t Flags = MI.getFlags();
3136 
3137   LLT S32 = LLT::scalar(32);
3138   LLT S1 = LLT::scalar(1);
3139 
3140   auto Abs = B.buildFAbs(S32, RHS, Flags);
3141   const APFloat C0Val(1.0f);
3142 
3143   auto C0 = B.buildConstant(S32, 0x6f800000);
3144   auto C1 = B.buildConstant(S32, 0x2f800000);
3145   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3146 
3147   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3148   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3149 
3150   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3151 
3152   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3153     .addUse(Mul0.getReg(0))
3154     .setMIFlags(Flags);
3155 
3156   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3157 
3158   B.buildFMul(Res, Sel, Mul1, Flags);
3159 
3160   MI.eraseFromParent();
3161   return true;
3162 }
3163 
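// In a kernel, the implicit arguments live directly after the explicit kernel
// arguments, so the implicitarg pointer is the kernarg segment pointer plus
// the implicit parameter offset. Non-entry functions instead receive it as a
// preloaded argument.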
3164 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3165                                                  MachineRegisterInfo &MRI,
3166                                                  MachineIRBuilder &B) const {
3167   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3168   if (!MFI->isEntryFunction()) {
3169     return legalizePreloadedArgIntrin(MI, MRI, B,
3170                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3171   }
3172 
3173   uint64_t Offset =
3174     ST.getTargetLowering()->getImplicitParameterOffset(
3175       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3176   Register DstReg = MI.getOperand(0).getReg();
3177   LLT DstTy = MRI.getType(DstReg);
3178   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3179 
3180   const ArgDescriptor *Arg;
3181   const TargetRegisterClass *RC;
3182   std::tie(Arg, RC)
3183     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3184   if (!Arg)
3185     return false;
3186 
3187   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3188   if (!loadInputValue(KernargPtrReg, B, Arg))
3189     return false;
3190 
3191   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3192   MI.eraseFromParent();
3193   return true;
3194 }
3195 
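// amdgcn.is.shared / amdgcn.is.private: a flat pointer is in the LDS or
// private aperture iff its high 32 bits equal the corresponding aperture base,
// so compare the extracted high dword against the segment aperture register.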
3196 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3197                                               MachineRegisterInfo &MRI,
3198                                               MachineIRBuilder &B,
3199                                               unsigned AddrSpace) const {
3200   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3201   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3202   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3203   MI.eraseFromParent();
3204   return true;
3205 }
3206 
3207 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3208 // offset (the offset that is included in bounds checking and swizzling, to be
3209 // split between the instruction's voffset and immoffset fields) and soffset
3210 // (the offset that is excluded from bounds checking and swizzling, to go in
3211 // the instruction's soffset field).  This function takes the first kind of
3212 // offset and figures out how to split it between voffset and immoffset.
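// For example, a total constant offset of 5000 becomes ImmOffset = 904 with a
// 4096 contribution folded into the voffset base register.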
3213 std::tuple<Register, unsigned, unsigned>
3214 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3215                                         Register OrigOffset) const {
3216   const unsigned MaxImm = 4095;
3217   Register BaseReg;
3218   unsigned TotalConstOffset;
3219   MachineInstr *OffsetDef;
3220   const LLT S32 = LLT::scalar(32);
3221 
3222   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3223     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3224 
3225   unsigned ImmOffset = TotalConstOffset;
3226 
3227   // If the immediate value is too big for the immoffset field, put the value
3228   // and -4096 into the immoffset field so that the value that is copied/added
3229   // for the voffset field is a multiple of 4096, and it stands more chance
3230   // of being CSEd with the copy/add for another similar load/store.
3231   // However, do not do that rounding down to a multiple of 4096 if that is a
3232   // negative number, as it appears to be illegal to have a negative offset
3233   // in the vgpr, even if adding the immediate offset makes it positive.
3234   unsigned Overflow = ImmOffset & ~MaxImm;
3235   ImmOffset -= Overflow;
3236   if ((int32_t)Overflow < 0) {
3237     Overflow += ImmOffset;
3238     ImmOffset = 0;
3239   }
3240 
3241   if (Overflow != 0) {
3242     if (!BaseReg) {
3243       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3244     } else {
3245       auto OverflowVal = B.buildConstant(S32, Overflow);
3246       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3247     }
3248   }
3249 
3250   if (!BaseReg)
3251     BaseReg = B.buildConstant(S32, 0).getReg(0);
3252 
3253   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3254 }
3255 
3256 /// Handle register layout difference for f16 images for some subtargets.
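/// On such subtargets each 16-bit element occupies the low half of a 32-bit
/// register, so the value is unmerged into s16 pieces, each any-extended to
/// s32, and rebuilt as a vector of s32 elements.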
3257 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3258                                              MachineRegisterInfo &MRI,
3259                                              Register Reg) const {
3260   if (!ST.hasUnpackedD16VMem())
3261     return Reg;
3262 
3263   const LLT S16 = LLT::scalar(16);
3264   const LLT S32 = LLT::scalar(32);
3265   LLT StoreVT = MRI.getType(Reg);
3266   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3267 
3268   auto Unmerge = B.buildUnmerge(S16, Reg);
3269 
3270   SmallVector<Register, 4> WideRegs;
3271   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3272     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3273 
3274   int NumElts = StoreVT.getNumElements();
3275 
3276   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3277 }
3278 
3279 Register AMDGPULegalizerInfo::fixStoreSourceType(
3280   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3281   MachineRegisterInfo *MRI = B.getMRI();
3282   LLT Ty = MRI->getType(VData);
3283 
3284   const LLT S16 = LLT::scalar(16);
3285 
3286   // Fixup illegal register types for i8 stores.
3287   if (Ty == LLT::scalar(8) || Ty == S16) {
3288     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3289     return AnyExt;
3290   }
3291 
3292   if (Ty.isVector()) {
3293     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3294       if (IsFormat)
3295         return handleD16VData(B, *MRI, VData);
3296     }
3297   }
3298 
3299   return VData;
3300 }
3301 
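// Lower the raw/struct buffer store intrinsics to the G_AMDGPU_BUFFER_STORE*
// pseudos: fix up the source value type, split the offset into voffset and
// immoffset, and rebuild the operands in the fixed order the pseudos expect
// (vdata, rsrc, vindex, voffset, soffset, offset, [format], aux, idxen).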
3302 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3303                                               MachineRegisterInfo &MRI,
3304                                               MachineIRBuilder &B,
3305                                               bool IsTyped,
3306                                               bool IsFormat) const {
3307   Register VData = MI.getOperand(1).getReg();
3308   LLT Ty = MRI.getType(VData);
3309   LLT EltTy = Ty.getScalarType();
3310   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3311   const LLT S32 = LLT::scalar(32);
3312 
3313   VData = fixStoreSourceType(B, VData, IsFormat);
3314   Register RSrc = MI.getOperand(2).getReg();
3315 
3316   MachineMemOperand *MMO = *MI.memoperands_begin();
3317   const int MemSize = MMO->getSize();
3318 
3319   unsigned ImmOffset;
3320   unsigned TotalOffset;
3321 
3322   // The typed intrinsics add an immediate after the registers.
3323   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3324 
3325   // The struct intrinsic variants add one additional operand over raw.
3326   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3327   Register VIndex;
3328   int OpOffset = 0;
3329   if (HasVIndex) {
3330     VIndex = MI.getOperand(3).getReg();
3331     OpOffset = 1;
3332   }
3333 
3334   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3335   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3336 
3337   unsigned Format = 0;
3338   if (IsTyped) {
3339     Format = MI.getOperand(5 + OpOffset).getImm();
3340     ++OpOffset;
3341   }
3342 
3343   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3344 
3345   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3346   if (TotalOffset != 0)
3347     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3348 
3349   unsigned Opc;
3350   if (IsTyped) {
3351     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3352                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3353   } else if (IsFormat) {
3354     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3355                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3356   } else {
3357     switch (MemSize) {
3358     case 1:
3359       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3360       break;
3361     case 2:
3362       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3363       break;
3364     default:
3365       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3366       break;
3367     }
3368   }
3369 
3370   if (!VIndex)
3371     VIndex = B.buildConstant(S32, 0).getReg(0);
3372 
3373   auto MIB = B.buildInstr(Opc)
3374     .addUse(VData)              // vdata
3375     .addUse(RSrc)               // rsrc
3376     .addUse(VIndex)             // vindex
3377     .addUse(VOffset)            // voffset
3378     .addUse(SOffset)            // soffset
3379     .addImm(ImmOffset);         // offset(imm)
3380 
3381   if (IsTyped)
3382     MIB.addImm(Format);
3383 
3384   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3385      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3386      .addMemOperand(MMO);
3387 
3388   MI.eraseFromParent();
3389   return true;
3390 }
3391 
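// Buffer loads get the same operand rewrite as stores, but the result may also
// need adjustment: sub-dword (and scalar d16) results are loaded into an s32
// and truncated back, while unpacked d16 vector results are loaded as s32
// elements and repacked into the original 16-bit element type afterwards.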
3392 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3393                                              MachineRegisterInfo &MRI,
3394                                              MachineIRBuilder &B,
3395                                              bool IsFormat,
3396                                              bool IsTyped) const {
3397   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3398   MachineMemOperand *MMO = *MI.memoperands_begin();
3399   const int MemSize = MMO->getSize();
3400   const LLT S32 = LLT::scalar(32);
3401 
3402   Register Dst = MI.getOperand(0).getReg();
3403   Register RSrc = MI.getOperand(2).getReg();
3404 
3405   // The typed intrinsics add an immediate after the registers.
3406   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3407 
3408   // The struct intrinsic variants add one additional operand over raw.
3409   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3410   Register VIndex;
3411   int OpOffset = 0;
3412   if (HasVIndex) {
3413     VIndex = MI.getOperand(3).getReg();
3414     OpOffset = 1;
3415   }
3416 
3417   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3418   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3419 
3420   unsigned Format = 0;
3421   if (IsTyped) {
3422     Format = MI.getOperand(5 + OpOffset).getImm();
3423     ++OpOffset;
3424   }
3425 
3426   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3427   unsigned ImmOffset;
3428   unsigned TotalOffset;
3429 
3430   LLT Ty = MRI.getType(Dst);
3431   LLT EltTy = Ty.getScalarType();
3432   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3433   const bool Unpacked = ST.hasUnpackedD16VMem();
3434 
3435   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3436   if (TotalOffset != 0)
3437     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3438 
3439   unsigned Opc;
3440 
3441   if (IsTyped) {
3442     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3443                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3444   } else if (IsFormat) {
3445     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3446                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3447   } else {
3448     switch (MemSize) {
3449     case 1:
3450       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3451       break;
3452     case 2:
3453       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3454       break;
3455     default:
3456       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3457       break;
3458     }
3459   }
3460 
3461   Register LoadDstReg;
3462 
3463   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3464   LLT UnpackedTy = Ty.changeElementSize(32);
3465 
3466   if (IsExtLoad)
3467     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3468   else if (Unpacked && IsD16 && Ty.isVector())
3469     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3470   else
3471     LoadDstReg = Dst;
3472 
3473   if (!VIndex)
3474     VIndex = B.buildConstant(S32, 0).getReg(0);
3475 
3476   auto MIB = B.buildInstr(Opc)
3477     .addDef(LoadDstReg)         // vdata
3478     .addUse(RSrc)               // rsrc
3479     .addUse(VIndex)             // vindex
3480     .addUse(VOffset)            // voffset
3481     .addUse(SOffset)            // soffset
3482     .addImm(ImmOffset);         // offset(imm)
3483 
3484   if (IsTyped)
3485     MIB.addImm(Format);
3486 
3487   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3488      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3489      .addMemOperand(MMO);
3490 
3491   if (LoadDstReg != Dst) {
3492     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3493 
3494     // The result register was widened for the extending load; truncate it back.
3495     if (IsExtLoad) {
3496       B.buildTrunc(Dst, LoadDstReg);
3497     } else {
3498       // Repack to original 16-bit vector result
3499       // FIXME: G_TRUNC should work, but legalization currently fails
3500       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3501       SmallVector<Register, 4> Repack;
3502       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3503         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3504       B.buildMerge(Dst, Repack);
3505     }
3506   }
3507 
3508   MI.eraseFromParent();
3509   return true;
3510 }
3511 
3512 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3513                                                MachineIRBuilder &B,
3514                                                bool IsInc) const {
3515   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3516                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3517   B.buildInstr(Opc)
3518     .addDef(MI.getOperand(0).getReg())
3519     .addUse(MI.getOperand(2).getReg())
3520     .addUse(MI.getOperand(3).getReg())
3521     .cloneMemRefs(MI);
3522   MI.eraseFromParent();
3523   return true;
3524 }
3525 
3526 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3527   switch (IntrID) {
3528   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3529   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3530     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3531   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3532   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3533     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3534   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3535   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3536     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3537   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3538   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3539     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3540   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3541   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3542     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3543   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3544   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3545     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3546   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3547   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3548     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3549   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3550   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3551     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3552   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3553   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3554     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3555   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3556   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3557     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3558   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3559   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3560     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3561   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3562   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3563     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3564   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3565   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3566     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3567   default:
3568     llvm_unreachable("unhandled atomic opcode");
3569   }
3570 }
3571 
3572 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3573                                                MachineIRBuilder &B,
3574                                                Intrinsic::ID IID) const {
3575   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3576                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3577 
3578   Register Dst = MI.getOperand(0).getReg();
3579   Register VData = MI.getOperand(2).getReg();
3580 
3581   Register CmpVal;
3582   int OpOffset = 0;
3583 
3584   if (IsCmpSwap) {
3585     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3586     ++OpOffset;
3587   }
3588 
3589   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3590   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3591 
3592   // The struct intrinsic variants add one additional operand over raw.
3593   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3594   Register VIndex;
3595   if (HasVIndex) {
3596     VIndex = MI.getOperand(4 + OpOffset).getReg();
3597     ++OpOffset;
3598   }
3599 
3600   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3601   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3602   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3603 
3604   MachineMemOperand *MMO = *MI.memoperands_begin();
3605 
3606   unsigned ImmOffset;
3607   unsigned TotalOffset;
3608   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3609   if (TotalOffset != 0)
3610     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3611 
3612   if (!VIndex)
3613     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3614 
3615   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3616     .addDef(Dst)
3617     .addUse(VData); // vdata
3618 
3619   if (IsCmpSwap)
3620     MIB.addReg(CmpVal);
3621 
3622   MIB.addUse(RSrc)               // rsrc
3623      .addUse(VIndex)             // vindex
3624      .addUse(VOffset)            // voffset
3625      .addUse(SOffset)            // soffset
3626      .addImm(ImmOffset)          // offset(imm)
3627      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3628      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3629      .addMemOperand(MMO);
3630 
3631   MI.eraseFromParent();
3632   return true;
3633 }
3634 
3635 /// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
3636 /// registers, appending the results to \p PackedAddrs.
3637 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3638                                         SmallVectorImpl<Register> &PackedAddrs,
3639                                         int AddrIdx, int DimIdx, int EndIdx,
3640                                         int NumGradients) {
3641   const LLT S16 = LLT::scalar(16);
3642   const LLT V2S16 = LLT::vector(2, 16);
3643 
3644   for (int I = AddrIdx; I < EndIdx; ++I) {
3645     MachineOperand &SrcOp = MI.getOperand(I);
3646     if (!SrcOp.isReg())
3647       continue; // _L to _LZ may have eliminated this.
3648 
3649     Register AddrReg = SrcOp.getReg();
3650 
3651     if (I < DimIdx) {
3652       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3653       PackedAddrs.push_back(AddrReg);
3654     } else {
3655       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3656       // derivatives dx/dh and dx/dv are packed with undef.
3657       if (((I + 1) >= EndIdx) ||
3658           ((NumGradients / 2) % 2 == 1 &&
3659            (I == DimIdx + (NumGradients / 2) - 1 ||
3660             I == DimIdx + NumGradients - 1)) ||
3661           // Check for _L to _LZ optimization
3662           !MI.getOperand(I + 1).isReg()) {
3663         PackedAddrs.push_back(
3664             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3665                 .getReg(0));
3666       } else {
3667         PackedAddrs.push_back(
3668             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3669                 .getReg(0));
3670         ++I;
3671       }
3672     }
3673   }
3674 }
3675 
3676 /// Convert from separate vaddr components to a single vector address register,
3677 /// and replace the remaining operands with $noreg.
3678 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3679                                      int DimIdx, int NumVAddrs) {
3680   const LLT S32 = LLT::scalar(32);
3681 
3682   SmallVector<Register, 8> AddrRegs;
3683   for (int I = 0; I != NumVAddrs; ++I) {
3684     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3685     if (SrcOp.isReg()) {
3686       AddrRegs.push_back(SrcOp.getReg());
3687       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3688     }
3689   }
3690 
3691   int NumAddrRegs = AddrRegs.size();
3692   if (NumAddrRegs != 1) {
3693     // Round up to 8 elements for v5-v7
3694     // FIXME: Missing intermediate sized register classes and instructions.
3695     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3696       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3697       auto Undef = B.buildUndef(S32);
3698       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3699       NumAddrRegs = RoundedNumRegs;
3700     }
3701 
3702     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3703     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3704   }
3705 
3706   for (int I = 1; I != NumVAddrs; ++I) {
3707     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3708     if (SrcOp.isReg())
3709       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3710   }
3711 }
3712 
3713 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3714 ///
3715 /// Depending on the subtarget, loads and stores with 16-bit element data need to
3716 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3717 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3718 /// registers.
3719 ///
3720 /// We don't want to directly select image instructions just yet, but also want
3721 /// to expose all register repacking to the legalizer/combiners. We also don't
3722 /// want a selected instruction entering RegBankSelect. In order to avoid
3723 /// defining a multitude of intermediate image instructions, directly hack on
3724 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3725 /// now unnecessary arguments with $noreg.
3726 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3727     MachineInstr &MI, MachineIRBuilder &B,
3728     GISelChangeObserver &Observer,
3729     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3730 
3731   const int NumDefs = MI.getNumExplicitDefs();
3732   bool IsTFE = NumDefs == 2;
3733   // We are only processing the operands of d16 image operations on subtargets
3734   // that use the unpacked register layout, or need to repack the TFE result.
3735 
3736   // TODO: Do we need to guard against already legalized intrinsics?
3737   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3738     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3739 
3740   MachineRegisterInfo *MRI = B.getMRI();
3741   const LLT S32 = LLT::scalar(32);
3742   const LLT S16 = LLT::scalar(16);
3743   const LLT V2S16 = LLT::vector(2, 16);
3744 
3745   // Index of first address argument
3746   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3747 
3748   int NumVAddrs, NumGradients;
3749   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3750   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3751     getDMaskIdx(BaseOpcode, NumDefs);
3752   unsigned DMask = 0;
3753 
3754   // Check for 16-bit addresses and pack them if so.
3755   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3756   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3757   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3758   const bool IsG16 = GradTy == S16;
3759   const bool IsA16 = AddrTy == S16;
3760 
3761   int DMaskLanes = 0;
3762   if (!BaseOpcode->Atomic) {
3763     DMask = MI.getOperand(DMaskIdx).getImm();
3764     if (BaseOpcode->Gather4) {
3765       DMaskLanes = 4;
3766     } else if (DMask != 0) {
3767       DMaskLanes = countPopulation(DMask);
3768     } else if (!IsTFE && !BaseOpcode->Store) {
3769       // If dmask is 0, this is a no-op load. This can be eliminated.
3770       B.buildUndef(MI.getOperand(0));
3771       MI.eraseFromParent();
3772       return true;
3773     }
3774   }
3775 
3776   Observer.changingInstr(MI);
3777   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3778 
3779   unsigned NewOpcode = NumDefs == 0 ?
3780     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3781 
3782   // Track that we legalized this
3783   MI.setDesc(B.getTII().get(NewOpcode));
3784 
3785   // With TFE enabled and a dmask of 0 we still expect to get the error flag, so
3786   // force dmask to be at least 1, otherwise the instruction will fail.
3787   if (IsTFE && DMask == 0) {
3788     DMask = 0x1;
3789     DMaskLanes = 1;
3790     MI.getOperand(DMaskIdx).setImm(DMask);
3791   }
3792 
3793   if (BaseOpcode->Atomic) {
3794     Register VData0 = MI.getOperand(2).getReg();
3795     LLT Ty = MRI->getType(VData0);
3796 
3797     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3798     if (Ty.isVector())
3799       return false;
3800 
3801     if (BaseOpcode->AtomicX2) {
3802       Register VData1 = MI.getOperand(3).getReg();
3803       // The two values are packed in one register.
3804       LLT PackedTy = LLT::vector(2, Ty);
3805       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3806       MI.getOperand(2).setReg(Concat.getReg(0));
3807       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3808     }
3809   }
3810 
3811   int CorrectedNumVAddrs = NumVAddrs;
3812 
3813   // Optimize _L to _LZ when _L is zero
3814   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3815         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3816     const ConstantFP *ConstantLod;
3817     const int LodIdx = AddrIdx + NumVAddrs - 1;
3818 
3819     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3820       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3821         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3822         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3823           LZMappingInfo->LZ, ImageDimIntr->Dim);
3824 
3825         // The starting indexes should remain in the same place.
3826         --NumVAddrs;
3827         --CorrectedNumVAddrs;
3828 
3829         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3830           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3831         MI.RemoveOperand(LodIdx);
3832       }
3833     }
3834   }
3835 
3836   // Optimize _mip away when 'lod' is zero.
3837   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3838     int64_t ConstantLod;
3839     const int LodIdx = AddrIdx + NumVAddrs - 1;
3840 
3841     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3842       if (ConstantLod == 0) {
3843         // TODO: Change the intrinsic opcode and remove the operand instead of
3844         // replacing it with 0, as is done for the _L to _LZ handling above.
3845         MI.getOperand(LodIdx).ChangeToImmediate(0);
3846         --CorrectedNumVAddrs;
3847       }
3848     }
3849   }
3850 
3851   // Rewrite the addressing register layout before doing anything else.
3852   if (IsA16 || IsG16) {
3853     if (IsA16) {
3854       // Target must support the feature and gradients need to be 16 bit too
3855       if (!ST.hasA16() || !IsG16)
3856         return false;
3857     } else if (!ST.hasG16())
3858       return false;
3859 
3860     if (NumVAddrs > 1) {
3861       SmallVector<Register, 4> PackedRegs;
3862       // Don't compress addresses for G16
3863       const int PackEndIdx =
3864           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3865       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3866                                   PackEndIdx, NumGradients);
3867 
3868       if (!IsA16) {
3869         // Add uncompressed address
3870         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3871           Register AddrReg = MI.getOperand(I).getReg();
3872           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3873           PackedRegs.push_back(AddrReg);
3874         }
3875       }
3876 
3877       // See also below in the non-a16 branch
3878       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3879 
3880       if (!UseNSA && PackedRegs.size() > 1) {
3881         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3882         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3883         PackedRegs[0] = Concat.getReg(0);
3884         PackedRegs.resize(1);
3885       }
3886 
3887       const int NumPacked = PackedRegs.size();
3888       for (int I = 0; I != NumVAddrs; ++I) {
3889         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3890         if (!SrcOp.isReg()) {
3891           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3892           continue;
3893         }
3894 
3895         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3896 
3897         if (I < NumPacked)
3898           SrcOp.setReg(PackedRegs[I]);
3899         else
3900           SrcOp.setReg(AMDGPU::NoRegister);
3901       }
3902     }
3903   } else {
3904     // If the register allocator cannot place the address registers contiguously
3905     // without introducing moves, then using the non-sequential address encoding
3906     // is always preferable, since it saves VALU instructions and is usually a
3907     // wash in terms of code size or even better.
3908     //
3909     // However, we currently have no way of hinting to the register allocator
3910     // that MIMG addresses should be placed contiguously when it is possible to
3911     // do so, so force non-NSA for the common 2-address case as a heuristic.
3912     //
3913     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3914     // allocation when possible.
3915     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3916 
3917     if (!UseNSA && NumVAddrs > 1)
3918       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3919   }
3920 
3921   int Flags = 0;
3922   if (IsA16)
3923     Flags |= 1;
3924   if (IsG16)
3925     Flags |= 2;
3926   MI.addOperand(MachineOperand::CreateImm(Flags));
3927 
3928   if (BaseOpcode->Store) { // No TFE for stores?
3929     // TODO: Handle dmask trim
3930     Register VData = MI.getOperand(1).getReg();
3931     LLT Ty = MRI->getType(VData);
3932     if (!Ty.isVector() || Ty.getElementType() != S16)
3933       return true;
3934 
3935     Register RepackedReg = handleD16VData(B, *MRI, VData);
3936     if (RepackedReg != VData) {
3937       MI.getOperand(1).setReg(RepackedReg);
3938     }
3939 
3940     return true;
3941   }
3942 
3943   Register DstReg = MI.getOperand(0).getReg();
3944   LLT Ty = MRI->getType(DstReg);
3945   const LLT EltTy = Ty.getScalarType();
3946   const bool IsD16 = Ty.getScalarType() == S16;
3947   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3948 
3949   // Confirm that the return type is large enough for the dmask specified
3950   if (NumElts < DMaskLanes)
3951     return false;
3952 
3953   if (NumElts > 4 || DMaskLanes > 4)
3954     return false;
3955 
3956   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3957   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3958 
3959   // The raw dword aligned data component of the load. The only legal cases
3960   // where this matters should be when using the packed D16 format, for
3961   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3962   LLT RoundedTy;
3963 
3964   // S32 vector to cover all data, plus the TFE result element.
3965   LLT TFETy;
3966 
3967   // Register type to use for each loaded component. Will be S32 or V2S16.
3968   LLT RegTy;
3969 
3970   if (IsD16 && ST.hasUnpackedD16VMem()) {
3971     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3972     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3973     RegTy = S32;
3974   } else {
3975     unsigned EltSize = EltTy.getSizeInBits();
3976     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3977     unsigned RoundedSize = 32 * RoundedElts;
3978     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3979     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3980     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3981   }
3982 
3983   // The return type does not need adjustment.
3984   // TODO: Should we change s16 case to s32 or <2 x s16>?
3985   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3986     return true;
3987 
3988   Register Dst1Reg;
3989 
3990   // Insert after the instruction.
3991   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3992 
3993   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3994   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3995   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3996   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3997 
3998   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3999 
4000   MI.getOperand(0).setReg(NewResultReg);
4001 
4002   // In the IR, TFE is supposed to be used with a 2 element struct return
4003   // type. The instruction really returns these two values in one contiguous
4004   // register, with one additional dword beyond the loaded data. Rewrite the
4005   // return type to use a single register result.
4006 
4007   if (IsTFE) {
4008     Dst1Reg = MI.getOperand(1).getReg();
4009     if (MRI->getType(Dst1Reg) != S32)
4010       return false;
4011 
4012     // TODO: Make sure the TFE operand bit is set.
4013     MI.RemoveOperand(1);
4014 
4015     // Handle the easy case that requires no repack instructions.
4016     if (Ty == S32) {
4017       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4018       return true;
4019     }
4020   }
4021 
4022   // Now figure out how to copy the new result register back into the old
4023   // result.
4024   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4025 
4026   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4027 
4028   if (ResultNumRegs == 1) {
4029     assert(!IsTFE);
4030     ResultRegs[0] = NewResultReg;
4031   } else {
4032     // We have to repack into a new vector of some kind.
4033     for (int I = 0; I != NumDataRegs; ++I)
4034       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4035     B.buildUnmerge(ResultRegs, NewResultReg);
4036 
4037     // Drop the final TFE element to get the data part. The TFE result is
4038     // directly written to the right place already.
4039     if (IsTFE)
4040       ResultRegs.resize(NumDataRegs);
4041   }
4042 
4043   // For an s16 scalar result, we form an s32 result with a truncate regardless
4044   // of packed vs. unpacked.
4045   if (IsD16 && !Ty.isVector()) {
4046     B.buildTrunc(DstReg, ResultRegs[0]);
4047     return true;
4048   }
4049 
4050   // Avoid a build/concat_vector of 1 entry.
4051   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4052     B.buildBitcast(DstReg, ResultRegs[0]);
4053     return true;
4054   }
4055 
4056   assert(Ty.isVector());
4057 
4058   if (IsD16) {
4059     // For packed D16 results with TFE enabled, all the data components are
4060     // S32. Cast back to the expected type.
4061     //
4062     // TODO: We don't really need to load s32 elements. We would only need one
4063     // cast for the TFE result if a multiple of v2s16 was used.
4064     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4065       for (Register &Reg : ResultRegs)
4066         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4067     } else if (ST.hasUnpackedD16VMem()) {
4068       for (Register &Reg : ResultRegs)
4069         Reg = B.buildTrunc(S16, Reg).getReg(0);
4070     }
4071   }
4072 
4073   auto padWithUndef = [&](LLT Ty, int NumElts) {
4074     if (NumElts == 0)
4075       return;
4076     Register Undef = B.buildUndef(Ty).getReg(0);
4077     for (int I = 0; I != NumElts; ++I)
4078       ResultRegs.push_back(Undef);
4079   };
4080 
4081   // Pad out any elements eliminated due to the dmask.
4082   LLT ResTy = MRI->getType(ResultRegs[0]);
4083   if (!ResTy.isVector()) {
4084     padWithUndef(ResTy, NumElts - ResultRegs.size());
4085     B.buildBuildVector(DstReg, ResultRegs);
4086     return true;
4087   }
4088 
4089   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4090   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4091 
4092   // Deal with the one annoying legal case.
4093   const LLT V3S16 = LLT::vector(3, 16);
4094   if (Ty == V3S16) {
4095     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4096     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4097     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4098     return true;
4099   }
4100 
4101   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4102   B.buildConcatVectors(DstReg, ResultRegs);
4103   return true;
4104 }
4105 
4106 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4107   MachineInstr &MI, MachineIRBuilder &B,
4108   GISelChangeObserver &Observer) const {
4109   Register Dst = MI.getOperand(0).getReg();
4110   LLT Ty = B.getMRI()->getType(Dst);
4111   unsigned Size = Ty.getSizeInBits();
4112   MachineFunction &MF = B.getMF();
4113 
4114   Observer.changingInstr(MI);
4115 
4116   // FIXME: We don't really need this intermediate instruction. The intrinsic
4117   // should be fixed to have a memory operand. Since it's readnone, we're not
4118   // allowed to add one.
4119   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4120   MI.RemoveOperand(1); // Remove intrinsic ID
4121 
4122   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4123   // TODO: Should this use datalayout alignment?
4124   const unsigned MemSize = (Size + 7) / 8;
4125   const Align MemAlign(4);
4126   MachineMemOperand *MMO = MF.getMachineMemOperand(
4127       MachinePointerInfo(),
4128       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4129           MachineMemOperand::MOInvariant,
4130       MemSize, MemAlign);
4131   MI.addMemOperand(MF, MMO);
4132 
4133   // There are no 96-bit result scalar loads, but widening to 128-bit should
4134   // always be legal. We may need to restore this to a 96-bit result if it turns
4135   // out this needs to be converted to a vector load during RegBankSelect.
4136   if (!isPowerOf2_32(Size)) {
4137     LegalizerHelper Helper(MF, *this, Observer, B);
4138 
4139     if (Ty.isVector())
4140       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4141     else
4142       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4143   }
4144 
4145   Observer.changedInstr(MI);
4146   return true;
4147 }
4148 
4149 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4150                                                 MachineRegisterInfo &MRI,
4151                                                 MachineIRBuilder &B) const {
4152   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4153   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4154       !ST.isTrapHandlerEnabled()) {
4155     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4156   } else {
4157     // Pass queue pointer to trap handler as input, and insert trap instruction
4158     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4159     const ArgDescriptor *Arg =
4160         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4161     if (!Arg)
4162       return false;
4163     MachineRegisterInfo &MRI = *B.getMRI();
4164     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4165     Register LiveIn = getLiveInRegister(
4166         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4167         /*InsertLiveInCopy=*/false);
4168     if (!loadInputValue(LiveIn, B, Arg))
4169       return false;
4170     B.buildCopy(SGPR01, LiveIn);
4171     B.buildInstr(AMDGPU::S_TRAP)
4172         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4173         .addReg(SGPR01, RegState::Implicit);
4174   }
4175 
4176   MI.eraseFromParent();
4177   return true;
4178 }
4179 
4180 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4181     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4182   // If this is a non-HSA path or the trap handler is disabled, report a warning
4183   // accordingly.
4184   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4185       !ST.isTrapHandlerEnabled()) {
4186     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4187                                      "debugtrap handler not supported",
4188                                      MI.getDebugLoc(), DS_Warning);
4189     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4190     Ctx.diagnose(NoTrap);
4191   } else {
4192     // Insert debug-trap instruction
4193     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4194   }
4195 
4196   MI.eraseFromParent();
4197   return true;
4198 }
4199 
4200 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4201                                             MachineInstr &MI) const {
4202   MachineIRBuilder &B = Helper.MIRBuilder;
4203   MachineRegisterInfo &MRI = *B.getMRI();
4204 
4205   // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
4206   auto IntrID = MI.getIntrinsicID();
4207   switch (IntrID) {
4208   case Intrinsic::amdgcn_if:
4209   case Intrinsic::amdgcn_else: {
4210     MachineInstr *Br = nullptr;
4211     MachineBasicBlock *UncondBrTarget = nullptr;
4212     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4213       const SIRegisterInfo *TRI
4214         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4215 
4216       Register Def = MI.getOperand(1).getReg();
4217       Register Use = MI.getOperand(3).getReg();
4218 
4219       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4220       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4221       if (IntrID == Intrinsic::amdgcn_if) {
4222         B.buildInstr(AMDGPU::SI_IF)
4223           .addDef(Def)
4224           .addUse(Use)
4225           .addMBB(UncondBrTarget);
4226       } else {
4227         B.buildInstr(AMDGPU::SI_ELSE)
4228           .addDef(Def)
4229           .addUse(Use)
4230           .addMBB(UncondBrTarget)
4231           .addImm(0);
4232       }
4233 
4234       if (Br) {
4235         Br->getOperand(0).setMBB(CondBrTarget);
4236       } else {
4237         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4238         // since we're swapping branch targets it needs to be reinserted.
4239         // FIXME: IRTranslator should probably not do this
4240         B.buildBr(*CondBrTarget);
4241       }
4242 
4243       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4244       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4245       MI.eraseFromParent();
4246       BrCond->eraseFromParent();
4247       return true;
4248     }
4249 
4250     return false;
4251   }
4252   case Intrinsic::amdgcn_loop: {
4253     MachineInstr *Br = nullptr;
4254     MachineBasicBlock *UncondBrTarget = nullptr;
4255     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4256       const SIRegisterInfo *TRI
4257         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4258 
4259       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4260       Register Reg = MI.getOperand(2).getReg();
4261 
4262       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4263       B.buildInstr(AMDGPU::SI_LOOP)
4264         .addUse(Reg)
4265         .addMBB(UncondBrTarget);
4266 
4267       if (Br)
4268         Br->getOperand(0).setMBB(CondBrTarget);
4269       else
4270         B.buildBr(*CondBrTarget);
4271 
4272       MI.eraseFromParent();
4273       BrCond->eraseFromParent();
4274       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4275       return true;
4276     }
4277 
4278     return false;
4279   }
4280   case Intrinsic::amdgcn_kernarg_segment_ptr:
4281     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4282       // This only makes sense to call in a kernel, so just lower to null.
4283       B.buildConstant(MI.getOperand(0).getReg(), 0);
4284       MI.eraseFromParent();
4285       return true;
4286     }
4287 
4288     return legalizePreloadedArgIntrin(
4289       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4290   case Intrinsic::amdgcn_implicitarg_ptr:
4291     return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
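    // The wavefront size is a fixed property of the subtarget, so fold the
    // intrinsic to a constant.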
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
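  // The raw/struct buffer store intrinsics share one lowering; the boolean
  // flags distinguish the plain, format, and typed (tbuffer) variants.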
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
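  // Buffer loads follow the same scheme as the stores above.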
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
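  // All raw/struct buffer atomics share a common lowering; the intrinsic ID
  // selects the resulting buffer atomic opcode.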
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
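  // The inc/dec atomics become the target's own generic atomic instructions,
  // preserving the original memory operands.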
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
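  // Image intrinsics are handled via the generated dimension-info table; any
  // other intrinsic needs no changes and is reported as legal.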
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}