1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
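// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.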
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the scalar size in bits up to the next power of two.
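// e.g. s24 -> s32, s65 -> s128.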
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
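// Match vectors with an odd number of sub-32-bit elements whose total size is
// not a multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.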
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
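// Match vectors of 16-bit elements with more than two elements, e.g. <4 x s16>.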
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
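// Pad the vector with one extra element, e.g. <3 x s16> -> <4 x s16>.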
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
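// Split into pieces of at most 64 bits, e.g. <4 x s32> -> <2 x s32>.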
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements to reach the next multiple of 32
// bits in total size.
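// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits), <5 x s16> -> <6 x s16>.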
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
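// Compute a type to bitcast to that maps onto whole registers: a single scalar
// for sizes up to 32 bits, otherwise a vector of 32-bit elements.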
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
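      // <6 x s16> -> <3 x s32>
      // <8 x s8>  -> <2 x s32>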
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
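// Whether the size fills a whole number of 32-bit registers, up to
// MaxRegisterSize.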
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
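// Element types that map onto 32-bit registers without padding: s16 (packed in
// pairs) or any multiple of 32 bits.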
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size, and
183 // multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, Align);
#endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
  : ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
444   // FIXME: Not really legal. Placeholder for custom lowering.
445   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446     .customFor({S32, S64})
447     .clampScalar(0, S32, S64)
448     .widenScalarToNextPow2(0, 32)
449     .scalarize(0);
450 
451   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
452     .legalFor({S32})
453     .clampScalar(0, S32, S32)
454     .scalarize(0);
455 
456   // Report legal for any types we can handle anywhere. For the cases only legal
457   // on the SALU, RegBankSelect will be able to re-legalize.
458   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
459     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
460     .clampScalar(0, S32, S64)
461     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
462     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
463     .widenScalarToNextPow2(0)
464     .scalarize(0);
465 
466   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
467                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468     .legalFor({{S32, S1}, {S32, S32}})
469     .minScalar(0, S32)
470     // TODO: .scalarize(0)
471     .lower();
472 
473   getActionDefinitionsBuilder(G_BITCAST)
474     // Don't worry about the size constraint.
475     .legalIf(all(isRegisterType(0), isRegisterType(1)))
476     .lower();
477 
479   getActionDefinitionsBuilder(G_CONSTANT)
480     .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
482     .clampScalar(0, S32, S64)
483     .widenScalarToNextPow2(0)
484     .legalIf(isPointer(0));
485 
486   getActionDefinitionsBuilder(G_FCONSTANT)
487     .legalFor({S32, S64, S16})
488     .clampScalar(0, S16, S64);
489 
490   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491       .legalIf(isRegisterType(0))
492       // s1 and s16 are special cases because they have legal operations on
493       // them, but don't really occupy registers in the normal way.
494       .legalFor({S1, S16})
495       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496       .clampScalarOrElt(0, S32, MaxScalar)
497       .widenScalarToNextPow2(0, 32)
498       .clampMaxNumElements(0, S32, 16);
499 
500   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501 
502   // If the amount is divergent, we have to do a wave reduction to get the
503   // maximum value, so this is expanded during RegBankSelect.
504   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505     .legalFor({{PrivatePtr, S32}});
506 
507   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508     .unsupportedFor({PrivatePtr})
509     .custom();
510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511 
512   auto &FPOpActions = getActionDefinitionsBuilder(
513     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
514     .legalFor({S32, S64});
515   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
516     .customFor({S32, S64});
517   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
518     .customFor({S32, S64});
519 
520   if (ST.has16BitInsts()) {
521     if (ST.hasVOP3PInsts())
522       FPOpActions.legalFor({S16, V2S16});
523     else
524       FPOpActions.legalFor({S16});
525 
526     TrigActions.customFor({S16});
527     FDIVActions.customFor({S16});
528   }
529 
530   auto &MinNumMaxNum = getActionDefinitionsBuilder({
531       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
532 
533   if (ST.hasVOP3PInsts()) {
534     MinNumMaxNum.customFor(FPTypesPK16)
535       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
536       .clampMaxNumElements(0, S16, 2)
537       .clampScalar(0, S16, S64)
538       .scalarize(0);
539   } else if (ST.has16BitInsts()) {
540     MinNumMaxNum.customFor(FPTypes16)
541       .clampScalar(0, S16, S64)
542       .scalarize(0);
543   } else {
544     MinNumMaxNum.customFor(FPTypesBase)
545       .clampScalar(0, S32, S64)
546       .scalarize(0);
547   }
548 
549   if (ST.hasVOP3PInsts())
550     FPOpActions.clampMaxNumElements(0, S16, 2);
551 
552   FPOpActions
553     .scalarize(0)
554     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
555 
556   TrigActions
557     .scalarize(0)
558     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
559 
560   FDIVActions
561     .scalarize(0)
562     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
563 
564   getActionDefinitionsBuilder({G_FNEG, G_FABS})
565     .legalFor(FPTypesPK16)
566     .clampMaxNumElements(0, S16, 2)
567     .scalarize(0)
568     .clampScalar(0, S16, S64);
569 
570   if (ST.has16BitInsts()) {
571     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
572       .legalFor({S32, S64, S16})
573       .scalarize(0)
574       .clampScalar(0, S16, S64);
575   } else {
576     getActionDefinitionsBuilder(G_FSQRT)
577       .legalFor({S32, S64})
578       .scalarize(0)
579       .clampScalar(0, S32, S64);
580 
581     if (ST.hasFractBug()) {
582       getActionDefinitionsBuilder(G_FFLOOR)
583         .customFor({S64})
584         .legalFor({S32, S64})
585         .scalarize(0)
586         .clampScalar(0, S32, S64);
587     } else {
588       getActionDefinitionsBuilder(G_FFLOOR)
589         .legalFor({S32, S64})
590         .scalarize(0)
591         .clampScalar(0, S32, S64);
592     }
593   }
594 
595   getActionDefinitionsBuilder(G_FPTRUNC)
596     .legalFor({{S32, S64}, {S16, S32}})
597     .scalarize(0)
598     .lower();
599 
600   getActionDefinitionsBuilder(G_FPEXT)
601     .legalFor({{S64, S32}, {S32, S16}})
602     .lowerFor({{S64, S16}}) // FIXME: Implement
603     .scalarize(0);
604 
605   getActionDefinitionsBuilder(G_FSUB)
606       // Use actual fsub instruction
607       .legalFor({S32})
608       // Must use fadd + fneg
609       .lowerFor({S64, S16, V2S16})
610       .scalarize(0)
611       .clampScalar(0, S32, S64);
612 
613   // Whether this is legal depends on the floating point mode for the function.
614   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
616     FMad.customFor({S32, S16});
617   else if (ST.hasMadMacF32Insts())
618     FMad.customFor({S32});
619   else if (ST.hasMadF16())
620     FMad.customFor({S16});
621   FMad.scalarize(0)
622       .lower();
623 
624   // TODO: Do we need to clamp maximum bitwidth?
625   getActionDefinitionsBuilder(G_TRUNC)
626     .legalIf(isScalar(0))
627     .legalFor({{V2S16, V2S32}})
628     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
632     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
633     .alwaysLegal();
634 
635   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
636     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
637                {S32, S1}, {S64, S1}, {S16, S1}})
638     .scalarize(0)
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(1, 32);
641 
642   // TODO: Split s1->s64 during regbankselect for VALU.
643   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
644     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
645     .lowerFor({{S32, S64}})
646     .lowerIf(typeIs(1, S1))
647     .customFor({{S64, S64}});
648   if (ST.has16BitInsts())
649     IToFP.legalFor({{S16, S16}});
650   IToFP.clampScalar(1, S32, S64)
651        .scalarize(0)
652        .widenScalarToNextPow2(1);
653 
654   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656     .customFor({{S64, S64}});
657   if (ST.has16BitInsts())
658     FPToI.legalFor({{S16, S16}});
659   else
660     FPToI.minScalar(1, S32);
661 
662   FPToI.minScalar(0, S32)
663        .scalarize(0)
664        .lower();
665 
666   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
667     .scalarize(0)
668     .lower();
669 
670   if (ST.has16BitInsts()) {
671     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
672       .legalFor({S16, S32, S64})
673       .clampScalar(0, S16, S64)
674       .scalarize(0);
675   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
676     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
677       .legalFor({S32, S64})
678       .clampScalar(0, S32, S64)
679       .scalarize(0);
680   } else {
681     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
682       .legalFor({S32})
683       .customFor({S64})
684       .clampScalar(0, S32, S64)
685       .scalarize(0);
686   }
687 
688   // FIXME: Clamp offset operand.
689   getActionDefinitionsBuilder(G_PTR_ADD)
690     .legalIf(isPointer(0))
691     .scalarize(0);
692 
693   getActionDefinitionsBuilder(G_PTRMASK)
694     .legalIf(typeInSet(1, {S64, S32}))
695     .minScalar(1, S32)
696     .maxScalarIf(sizeIs(0, 32), 1, S32)
697     .maxScalarIf(sizeIs(0, 64), 1, S64)
698     .scalarize(0);
699 
700   auto &CmpBuilder =
701     getActionDefinitionsBuilder(G_ICMP)
702     // The compare output type differs based on the register bank of the output,
703     // so make both s1 and s32 legal.
704     //
705     // Scalar compares producing output in scc will be promoted to s32, as that
706     // is the allocatable register type that will be needed for the copy from
707     // scc. This will be promoted during RegBankSelect, and we assume something
708     // before that won't try to use s32 result types.
709     //
710     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
711     // bank.
712     .legalForCartesianProduct(
713       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
714     .legalForCartesianProduct(
715       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
716   if (ST.has16BitInsts()) {
717     CmpBuilder.legalFor({{S1, S16}});
718   }
719 
720   CmpBuilder
721     .widenScalarToNextPow2(1)
722     .clampScalar(1, S32, S64)
723     .scalarize(0)
724     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
725 
726   getActionDefinitionsBuilder(G_FCMP)
727     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
728     .widenScalarToNextPow2(1)
729     .clampScalar(1, S32, S64)
730     .scalarize(0);
731 
732   // FIXME: fpow has a selection pattern that should move to custom lowering.
733   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
734   if (ST.has16BitInsts())
735     Exp2Ops.legalFor({S32, S16});
736   else
737     Exp2Ops.legalFor({S32});
738   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
739   Exp2Ops.scalarize(0);
740 
741   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
742   if (ST.has16BitInsts())
743     ExpOps.customFor({{S32}, {S16}});
744   else
745     ExpOps.customFor({S32});
746   ExpOps.clampScalar(0, MinScalarFPTy, S32)
747         .scalarize(0);
748 
749   // The 64-bit versions produce 32-bit results, but only on the SALU.
750   getActionDefinitionsBuilder(G_CTPOP)
751     .legalFor({{S32, S32}, {S32, S64}})
752     .clampScalar(0, S32, S32)
753     .clampScalar(1, S32, S64)
754     .scalarize(0)
755     .widenScalarToNextPow2(0, 32)
756     .widenScalarToNextPow2(1, 32);
757 
758   // The hardware instructions return a different result on 0 than the generic
759   // instructions expect. The hardware produces -1, but these produce the
760   // bitwidth.
761   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
762     .scalarize(0)
763     .clampScalar(0, S32, S32)
764     .clampScalar(1, S32, S64)
765     .widenScalarToNextPow2(0, 32)
766     .widenScalarToNextPow2(1, 32)
767     .lower();
768 
769   // The 64-bit versions produce 32-bit results, but only on the SALU.
770   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
771     .legalFor({{S32, S32}, {S32, S64}})
772     .clampScalar(0, S32, S32)
773     .clampScalar(1, S32, S64)
774     .scalarize(0)
775     .widenScalarToNextPow2(0, 32)
776     .widenScalarToNextPow2(1, 32);
777 
778   getActionDefinitionsBuilder(G_BITREVERSE)
779     .legalFor({S32})
780     .clampScalar(0, S32, S32)
781     .scalarize(0);
782 
783   if (ST.has16BitInsts()) {
784     getActionDefinitionsBuilder(G_BSWAP)
785       .legalFor({S16, S32, V2S16})
786       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
789       .widenScalarToNextPow2(0)
790       .clampScalar(0, S16, S32)
791       .scalarize(0);
792 
793     if (ST.hasVOP3PInsts()) {
794       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
795         .legalFor({S32, S16, V2S16})
796         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
797         .clampMaxNumElements(0, S16, 2)
798         .minScalar(0, S16)
799         .widenScalarToNextPow2(0)
800         .scalarize(0)
801         .lower();
802     } else {
803       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
804         .legalFor({S32, S16})
805         .widenScalarToNextPow2(0)
806         .minScalar(0, S16)
807         .scalarize(0)
808         .lower();
809     }
810   } else {
811     // TODO: Should have same legality without v_perm_b32
812     getActionDefinitionsBuilder(G_BSWAP)
813       .legalFor({S32})
814       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
817       .widenScalarToNextPow2(0)
818       .maxScalar(0, S32)
819       .scalarize(0)
820       .lower();
821 
822     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
823       .legalFor({S32})
824       .minScalar(0, S32)
825       .widenScalarToNextPow2(0)
826       .scalarize(0)
827       .lower();
828   }
829 
830   getActionDefinitionsBuilder(G_INTTOPTR)
831     // List the common cases
832     .legalForCartesianProduct(AddrSpaces64, {S64})
833     .legalForCartesianProduct(AddrSpaces32, {S32})
834     .scalarize(0)
835     // Accept any address space as long as the size matches
836     .legalIf(sameSize(0, 1))
837     .widenScalarIf(smallerThan(1, 0),
838       [](const LegalityQuery &Query) {
839         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
840       })
841     .narrowScalarIf(largerThan(1, 0),
842       [](const LegalityQuery &Query) {
843         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
844       });
845 
846   getActionDefinitionsBuilder(G_PTRTOINT)
847     // List the common cases
848     .legalForCartesianProduct(AddrSpaces64, {S64})
849     .legalForCartesianProduct(AddrSpaces32, {S32})
850     .scalarize(0)
851     // Accept any address space as long as the size matches
852     .legalIf(sameSize(0, 1))
853     .widenScalarIf(smallerThan(0, 1),
854       [](const LegalityQuery &Query) {
855         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
856       })
857     .narrowScalarIf(
858       largerThan(0, 1),
859       [](const LegalityQuery &Query) {
860         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
861       });
862 
863   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
864     .scalarize(0)
865     .custom();
866 
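  // Whether a load or store must be split: the memory size exceeds the limit
  // for the address space, it is a wide vector extload, it does not map onto a
  // supported number of 32-bit registers, or it is too misaligned for the
  // subtarget.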
867   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
868                                     bool IsLoad) -> bool {
869     const LLT DstTy = Query.Types[0];
870 
871     // Split vector extloads.
872     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
873     unsigned Align = Query.MMODescrs[0].AlignInBits;
874 
875     if (MemSize < DstTy.getSizeInBits())
876       MemSize = std::max(MemSize, Align);
877 
878     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
879       return true;
880 
881     const LLT PtrTy = Query.Types[1];
882     unsigned AS = PtrTy.getAddressSpace();
883     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
884       return true;
885 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
    // TODO: May be able to widen depending on alignment etc.
888     unsigned NumRegs = (MemSize + 31) / 32;
889     if (NumRegs == 3) {
890       if (!ST.hasDwordx3LoadStores())
891         return true;
892     } else {
893       // If the alignment allows, these should have been widened.
894       if (!isPowerOf2_32(NumRegs))
895         return true;
896     }
897 
898     if (Align < MemSize) {
899       const SITargetLowering *TLI = ST.getTargetLowering();
900       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
901     }
902 
903     return false;
904   };
905 
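  // Whether a load result with a non-power-of-2 size should be widened to the
  // next power of two: only if the size is below the address space limit and
  // the alignment already covers the rounded-up access.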
906   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
907                                          unsigned Opc) -> bool {
908     unsigned Size = Query.Types[0].getSizeInBits();
909     if (isPowerOf2_32(Size))
910       return false;
911 
912     if (Size == 96 && ST.hasDwordx3LoadStores())
913       return false;
914 
915     unsigned AddrSpace = Query.Types[1].getAddressSpace();
916     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
917       return false;
918 
919     unsigned Align = Query.MMODescrs[0].AlignInBits;
920     unsigned RoundedSize = NextPowerOf2(Size);
921     return (Align >= RoundedSize);
922   };
923 
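  // If unaligned buffer access is supported, no minimum alignment is required
  // for global and constant accesses; otherwise require natural alignment.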
924   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
925   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
926   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
927 
928   // TODO: Refine based on subtargets which support unaligned access or 128-bit
929   // LDS
930   // TODO: Unsupported flat for SI.
931 
932   for (unsigned Op : {G_LOAD, G_STORE}) {
933     const bool IsStore = Op == G_STORE;
934 
935     auto &Actions = getActionDefinitionsBuilder(Op);
936     // Explicitly list some common cases.
937     // TODO: Does this help compile time at all?
938     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
939                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
940                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
941                                       {S64, GlobalPtr, 64, GlobalAlign32},
942                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
943                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
944                                       {S32, GlobalPtr, 8, GlobalAlign8},
945                                       {S32, GlobalPtr, 16, GlobalAlign16},
946 
947                                       {S32, LocalPtr, 32, 32},
948                                       {S64, LocalPtr, 64, 32},
949                                       {V2S32, LocalPtr, 64, 32},
950                                       {S32, LocalPtr, 8, 8},
951                                       {S32, LocalPtr, 16, 16},
952                                       {V2S16, LocalPtr, 32, 32},
953 
954                                       {S32, PrivatePtr, 32, 32},
955                                       {S32, PrivatePtr, 8, 8},
956                                       {S32, PrivatePtr, 16, 16},
957                                       {V2S16, PrivatePtr, 32, 32},
958 
959                                       {S32, ConstantPtr, 32, GlobalAlign32},
960                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
961                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
962                                       {S64, ConstantPtr, 64, GlobalAlign32},
963                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
964     Actions.legalIf(
965       [=](const LegalityQuery &Query) -> bool {
966         return isLoadStoreLegal(ST, Query, Op);
967       });
968 
969     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
970     // 64-bits.
971     //
972     // TODO: Should generalize bitcast action into coerce, which will also cover
973     // inserting addrspacecasts.
974     Actions.customIf(typeIs(1, Constant32Ptr));
975 
976     // Turn any illegal element vectors into something easier to deal
977     // with. These will ultimately produce 32-bit scalar shifts to extract the
978     // parts anyway.
979     //
980     // For odd 16-bit element vectors, prefer to split those into pieces with
981     // 16-bit vector parts.
982     Actions.bitcastIf(
983       [=](const LegalityQuery &Query) -> bool {
984         const LLT Ty = Query.Types[0];
985         const unsigned Size = Ty.getSizeInBits();
986 
987         if (Size != Query.MMODescrs[0].SizeInBits)
988           return Size <= 32 && Ty.isVector();
989 
990         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
991           return true;
992         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
993                !isRegisterVectorElementType(Ty.getElementType());
994       }, bitcastToRegisterType(0));
995 
996     Actions
997         .customIf(typeIs(1, Constant32Ptr))
998         // Widen suitably aligned loads by loading extra elements.
999         .moreElementsIf([=](const LegalityQuery &Query) {
1000             const LLT Ty = Query.Types[0];
1001             return Op == G_LOAD && Ty.isVector() &&
1002                    shouldWidenLoadResult(Query, Op);
1003           }, moreElementsToNextPow2(0))
1004         .widenScalarIf([=](const LegalityQuery &Query) {
1005             const LLT Ty = Query.Types[0];
1006             return Op == G_LOAD && !Ty.isVector() &&
1007                    shouldWidenLoadResult(Query, Op);
1008           }, widenScalarOrEltToNextPow2(0))
1009         .narrowScalarIf(
1010             [=](const LegalityQuery &Query) -> bool {
1011               return !Query.Types[0].isVector() &&
1012                      needToSplitMemOp(Query, Op == G_LOAD);
1013             },
1014             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1015               const LLT DstTy = Query.Types[0];
1016               const LLT PtrTy = Query.Types[1];
1017 
1018               const unsigned DstSize = DstTy.getSizeInBits();
1019               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020 
1021               // Split extloads.
1022               if (DstSize > MemSize)
1023                 return std::make_pair(0, LLT::scalar(MemSize));
1024 
1025               if (!isPowerOf2_32(DstSize)) {
1026                 // We're probably decomposing an odd sized store. Try to split
1027                 // to the widest type. TODO: Account for alignment. As-is it
1028                 // should be OK, since the new parts will be further legalized.
1029                 unsigned FloorSize = PowerOf2Floor(DstSize);
1030                 return std::make_pair(0, LLT::scalar(FloorSize));
1031               }
1032 
1033               if (DstSize > 32 && (DstSize % 32 != 0)) {
1034                 // FIXME: Need a way to specify non-extload of larger size if
1035                 // suitably aligned.
1036                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1037               }
1038 
1039               unsigned MaxSize = maxSizeForAddrSpace(ST,
1040                                                      PtrTy.getAddressSpace(),
1041                                                      Op == G_LOAD);
1042               if (MemSize > MaxSize)
1043                 return std::make_pair(0, LLT::scalar(MaxSize));
1044 
1045               unsigned Align = Query.MMODescrs[0].AlignInBits;
1046               return std::make_pair(0, LLT::scalar(Align));
1047             })
1048         .fewerElementsIf(
1049             [=](const LegalityQuery &Query) -> bool {
1050               return Query.Types[0].isVector() &&
1051                      needToSplitMemOp(Query, Op == G_LOAD);
1052             },
1053             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1054               const LLT DstTy = Query.Types[0];
1055               const LLT PtrTy = Query.Types[1];
1056 
1057               LLT EltTy = DstTy.getElementType();
1058               unsigned MaxSize = maxSizeForAddrSpace(ST,
1059                                                      PtrTy.getAddressSpace(),
1060                                                      Op == G_LOAD);
1061 
1062               // FIXME: Handle widened to power of 2 results better. This ends
1063               // up scalarizing.
1064               // FIXME: 3 element stores scalarized on SI
1065 
1066               // Split if it's too large for the address space.
1067               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1068                 unsigned NumElts = DstTy.getNumElements();
1069                 unsigned EltSize = EltTy.getSizeInBits();
1070 
1071                 if (MaxSize % EltSize == 0) {
1072                   return std::make_pair(
1073                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1074                 }
1075 
1076                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1077 
1078                 // FIXME: Refine when odd breakdowns handled
1079                 // The scalars will need to be re-legalized.
1080                 if (NumPieces == 1 || NumPieces >= NumElts ||
1081                     NumElts % NumPieces != 0)
1082                   return std::make_pair(0, EltTy);
1083 
1084                 return std::make_pair(0,
1085                                       LLT::vector(NumElts / NumPieces, EltTy));
1086               }
1087 
1088               // FIXME: We could probably handle weird extending loads better.
1089               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1090               if (DstTy.getSizeInBits() > MemSize)
1091                 return std::make_pair(0, EltTy);
1092 
1093               unsigned EltSize = EltTy.getSizeInBits();
1094               unsigned DstSize = DstTy.getSizeInBits();
1095               if (!isPowerOf2_32(DstSize)) {
1096                 // We're probably decomposing an odd sized store. Try to split
1097                 // to the widest type. TODO: Account for alignment. As-is it
1098                 // should be OK, since the new parts will be further legalized.
1099                 unsigned FloorSize = PowerOf2Floor(DstSize);
1100                 return std::make_pair(
1101                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1102               }
1103 
1104               // Need to split because of alignment.
1105               unsigned Align = Query.MMODescrs[0].AlignInBits;
1106               if (EltSize > Align &&
1107                   (EltSize / Align < DstTy.getNumElements())) {
1108                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1109               }
1110 
1111               // May need relegalization for the scalars.
1112               return std::make_pair(0, EltTy);
1113             })
1114         .minScalar(0, S32);
1115 
1116     if (IsStore)
1117       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1118 
1119     // TODO: Need a bitcast lower option?
1120     Actions
1121         .widenScalarToNextPow2(0)
1122         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1123   }
1124 
1125   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1126                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1127                                                   {S32, GlobalPtr, 16, 2 * 8},
1128                                                   {S32, LocalPtr, 8, 8},
1129                                                   {S32, LocalPtr, 16, 16},
1130                                                   {S32, PrivatePtr, 8, 8},
1131                                                   {S32, PrivatePtr, 16, 16},
1132                                                   {S32, ConstantPtr, 8, 8},
1133                                                   {S32, ConstantPtr, 16, 2 * 8}});
1134   if (ST.hasFlatAddressSpace()) {
1135     ExtLoads.legalForTypesWithMemDesc(
1136         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1137   }
1138 
1139   ExtLoads.clampScalar(0, S32, S32)
1140           .widenScalarToNextPow2(0)
1141           .unsupportedIfMemSizeNotPow2()
1142           .lower();
1143 
1144   auto &Atomics = getActionDefinitionsBuilder(
1145     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1146      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1147      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1148      G_ATOMICRMW_UMIN})
1149     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1150                {S64, GlobalPtr}, {S64, LocalPtr}});
1151   if (ST.hasFlatAddressSpace()) {
1152     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1153   }
1154 
1155   if (ST.hasLDSFPAtomics()) {
1156     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1157       .legalFor({{S32, LocalPtr}});
1158   }
1159 
1160   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1161   // demarshalling
1162   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1163     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1164                 {S32, FlatPtr}, {S64, FlatPtr}})
1165     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1166                {S32, RegionPtr}, {S64, RegionPtr}});
1167   // TODO: Pointer types, any 32-bit or 64-bit vector
1168 
1169   // Condition should be s32 for scalar, s1 for vector.
1170   getActionDefinitionsBuilder(G_SELECT)
1171     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1172           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1173           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1174     .clampScalar(0, S16, S64)
1175     .scalarize(1)
1176     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1177     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1178     .clampMaxNumElements(0, S32, 2)
1179     .clampMaxNumElements(0, LocalPtr, 2)
1180     .clampMaxNumElements(0, PrivatePtr, 2)
1181     .scalarize(0)
1182     .widenScalarToNextPow2(0)
1183     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1184 
1185   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1186   // be more flexible with the shift amount type.
1187   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1188     .legalFor({{S32, S32}, {S64, S32}});
1189   if (ST.has16BitInsts()) {
1190     if (ST.hasVOP3PInsts()) {
1191       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1192             .clampMaxNumElements(0, S16, 2);
1193     } else
1194       Shifts.legalFor({{S16, S16}});
1195 
1196     // TODO: Support 16-bit shift amounts for all types
1197     Shifts.widenScalarIf(
1198       [=](const LegalityQuery &Query) {
1199         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1200         // 32-bit amount.
1201         const LLT ValTy = Query.Types[0];
1202         const LLT AmountTy = Query.Types[1];
1203         return ValTy.getSizeInBits() <= 16 &&
1204                AmountTy.getSizeInBits() < 16;
1205       }, changeTo(1, S16));
1206     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1207     Shifts.clampScalar(1, S32, S32);
1208     Shifts.clampScalar(0, S16, S64);
1209     Shifts.widenScalarToNextPow2(0, 16);
1210   } else {
1211     // Make sure we legalize the shift amount type first, as the general
1212     // expansion for the shifted type will produce much worse code if it hasn't
1213     // been truncated already.
1214     Shifts.clampScalar(1, S32, S32);
1215     Shifts.clampScalar(0, S32, S64);
1216     Shifts.widenScalarToNextPow2(0, 32);
1217   }
1218   Shifts.scalarize(0);
1219 
1220   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1221     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1222     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1223     unsigned IdxTypeIdx = 2;
1224 
1225     getActionDefinitionsBuilder(Op)
1226       .customIf([=](const LegalityQuery &Query) {
1227           const LLT EltTy = Query.Types[EltTypeIdx];
1228           const LLT VecTy = Query.Types[VecTypeIdx];
1229           const LLT IdxTy = Query.Types[IdxTypeIdx];
1230           return (EltTy.getSizeInBits() == 16 ||
1231                   EltTy.getSizeInBits() % 32 == 0) &&
1232                  VecTy.getSizeInBits() % 32 == 0 &&
1233                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1234                  IdxTy.getSizeInBits() == 32;
1235         })
1236       .clampScalar(EltTypeIdx, S32, S64)
1237       .clampScalar(VecTypeIdx, S32, S64)
1238       .clampScalar(IdxTypeIdx, S32, S32);
1239   }
1240 
1241   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1242     .unsupportedIf([=](const LegalityQuery &Query) {
1243         const LLT &EltTy = Query.Types[1].getElementType();
1244         return Query.Types[0] != EltTy;
1245       });
1246 
1247   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1248     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1249     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1250 
1251     // FIXME: Doesn't handle extract of illegal sizes.
1252     getActionDefinitionsBuilder(Op)
1253       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1254       // FIXME: Multiples of 16 should not be legal.
1255       .legalIf([=](const LegalityQuery &Query) {
1256           const LLT BigTy = Query.Types[BigTyIdx];
1257           const LLT LitTy = Query.Types[LitTyIdx];
1258           return (BigTy.getSizeInBits() % 32 == 0) &&
1259                  (LitTy.getSizeInBits() % 16 == 0);
1260         })
1261       .widenScalarIf(
1262         [=](const LegalityQuery &Query) {
1263           const LLT BigTy = Query.Types[BigTyIdx];
1264           return (BigTy.getScalarSizeInBits() < 16);
1265         },
1266         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1267       .widenScalarIf(
1268         [=](const LegalityQuery &Query) {
1269           const LLT LitTy = Query.Types[LitTyIdx];
1270           return (LitTy.getScalarSizeInBits() < 16);
1271         },
1272         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1273       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1274       .widenScalarToNextPow2(BigTyIdx, 32);
1276   }
1277 
1278   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1279     .legalForCartesianProduct(AllS32Vectors, {S32})
1280     .legalForCartesianProduct(AllS64Vectors, {S64})
1281     .clampNumElements(0, V16S32, V32S32)
1282     .clampNumElements(0, V2S64, V16S64)
1283     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1284 
1285   if (ST.hasScalarPackInsts()) {
1286     BuildVector
1287       // FIXME: Should probably widen s1 vectors straight to s32
1288       .minScalarOrElt(0, S16)
1289       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1290       .minScalar(1, S32);
1291 
1292     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1293       .legalFor({V2S16, S32})
1294       .lower();
1295     BuildVector.minScalarOrElt(0, S32);
1296   } else {
1297     BuildVector.customFor({V2S16, S16});
1298     BuildVector.minScalarOrElt(0, S32);
1299 
1300     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1301       .customFor({V2S16, S32})
1302       .lower();
1303   }
1304 
1305   BuildVector.legalIf(isRegisterType(0));
1306 
1307   // FIXME: Clamp maximum size
1308   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1309     .legalIf(isRegisterType(0));
1310 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1313   if (ST.hasVOP3PInsts()) {
1314     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1315       .customFor({V2S16, V2S16})
1316       .lower();
1317   } else
1318     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1319 
1320   // Merge/Unmerge
1321   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1322     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1323     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1324 
1325     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1326       const LLT Ty = Query.Types[TypeIdx];
1327       if (Ty.isVector()) {
1328         const LLT &EltTy = Ty.getElementType();
1329         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1330           return true;
1331         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1332           return true;
1333       }
1334       return false;
1335     };
1336 
1337     auto &Builder = getActionDefinitionsBuilder(Op)
1338       .lowerFor({{S16, V2S16}})
1339       .lowerIf([=](const LegalityQuery &Query) {
1340           const LLT BigTy = Query.Types[BigTyIdx];
1341           return BigTy.getSizeInBits() == 32;
1342         })
1343       // Try to widen to s16 first for small types.
1344       // TODO: Only do this on targets with legal s16 shifts
1345       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1346       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1347       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1348       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1349                            elementTypeIs(1, S16)),
1350                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1352       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1353       // valid.
1354       .clampScalar(LitTyIdx, S32, S512)
1355       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1356       // Break up vectors with weird elements into scalars
1357       .fewerElementsIf(
1358         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1359         scalarize(0))
1360       .fewerElementsIf(
1361         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1362         scalarize(1))
1363       .clampScalar(BigTyIdx, S32, MaxScalar);
1364 
1365     if (Op == G_MERGE_VALUES) {
1366       Builder.widenScalarIf(
1367         // TODO: Use 16-bit shifts if legal for 8-bit values?
1368         [=](const LegalityQuery &Query) {
1369           const LLT Ty = Query.Types[LitTyIdx];
1370           return Ty.getSizeInBits() < 32;
1371         },
1372         changeTo(LitTyIdx, S32));
1373     }
1374 
1375     Builder.widenScalarIf(
1376       [=](const LegalityQuery &Query) {
1377         const LLT Ty = Query.Types[BigTyIdx];
1378         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1379           Ty.getSizeInBits() % 16 != 0;
1380       },
1381       [=](const LegalityQuery &Query) {
1382         // Pick the next power of 2, or a multiple of 64 over 128.
1383         // Whichever is smaller.
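        // e.g. s65 -> s128, s295 -> s320.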
1384         const LLT &Ty = Query.Types[BigTyIdx];
1385         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1386         if (NewSizeInBits >= 256) {
1387           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1388           if (RoundedTo < NewSizeInBits)
1389             NewSizeInBits = RoundedTo;
1390         }
1391         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1392       })
1393       .legalIf([=](const LegalityQuery &Query) {
1394           const LLT &BigTy = Query.Types[BigTyIdx];
1395           const LLT &LitTy = Query.Types[LitTyIdx];
1396 
1397           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1398             return false;
1399           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1400             return false;
1401 
1402           return BigTy.getSizeInBits() % 16 == 0 &&
1403                  LitTy.getSizeInBits() % 16 == 0 &&
1404                  BigTy.getSizeInBits() <= MaxRegisterSize;
1405         })
1406       // Any vectors left are the wrong size. Scalarize them.
1407       .scalarize(0)
1408       .scalarize(1);
1409   }
1410 
1411   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1412   // RegBankSelect.
1413   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1414     .legalFor({{S32}, {S64}});
1415 
1416   if (ST.hasVOP3PInsts()) {
1417     SextInReg.lowerFor({{V2S16}})
1418       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1419       // get more vector shift opportunities, since we'll get those when
1420       // expanded.
1421       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1422   } else if (ST.has16BitInsts()) {
1423     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1424   } else {
1425     // Prefer to promote to s32 before lowering if we don't have 16-bit
1426     // shifts. This avoids many intermediate truncate and extend operations.
1427     SextInReg.lowerFor({{S32}, {S64}});
1428   }
1429 
1430   SextInReg
1431     .scalarize(0)
1432     .clampScalar(0, S32, S64)
1433     .lower();
1434 
1435   getActionDefinitionsBuilder(G_FSHR)
1436     .legalFor({{S32, S32}})
1437     .scalarize(0)
1438     .lower();
1439 
1440   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1441     .legalFor({S64});
1442 
1443   getActionDefinitionsBuilder({
1444       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1445       G_FCOPYSIGN,
1446 
1447       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1448       G_READ_REGISTER,
1449       G_WRITE_REGISTER,
1450 
1451       G_SADDO, G_SSUBO,
1452 
1453       // TODO: Implement
1454       G_FMINIMUM, G_FMAXIMUM,
1455       G_FSHL
1456     }).lower();
1457 
1458   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1459         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1460         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1461     .unsupported();
1462 
1463   computeTables();
1464   verify(*ST.getInstrInfo());
1465 }
1466 
1467 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1468                                          MachineInstr &MI) const {
1469   MachineIRBuilder &B = Helper.MIRBuilder;
1470   MachineRegisterInfo &MRI = *B.getMRI();
1471   GISelChangeObserver &Observer = Helper.Observer;
1472 
1473   switch (MI.getOpcode()) {
1474   case TargetOpcode::G_ADDRSPACE_CAST:
1475     return legalizeAddrSpaceCast(MI, MRI, B);
1476   case TargetOpcode::G_FRINT:
1477     return legalizeFrint(MI, MRI, B);
1478   case TargetOpcode::G_FCEIL:
1479     return legalizeFceil(MI, MRI, B);
1480   case TargetOpcode::G_INTRINSIC_TRUNC:
1481     return legalizeIntrinsicTrunc(MI, MRI, B);
1482   case TargetOpcode::G_SITOFP:
1483     return legalizeITOFP(MI, MRI, B, true);
1484   case TargetOpcode::G_UITOFP:
1485     return legalizeITOFP(MI, MRI, B, false);
1486   case TargetOpcode::G_FPTOSI:
1487     return legalizeFPTOI(MI, MRI, B, true);
1488   case TargetOpcode::G_FPTOUI:
1489     return legalizeFPTOI(MI, MRI, B, false);
1490   case TargetOpcode::G_FMINNUM:
1491   case TargetOpcode::G_FMAXNUM:
1492   case TargetOpcode::G_FMINNUM_IEEE:
1493   case TargetOpcode::G_FMAXNUM_IEEE:
1494     return legalizeMinNumMaxNum(Helper, MI);
1495   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1496     return legalizeExtractVectorElt(MI, MRI, B);
1497   case TargetOpcode::G_INSERT_VECTOR_ELT:
1498     return legalizeInsertVectorElt(MI, MRI, B);
1499   case TargetOpcode::G_SHUFFLE_VECTOR:
1500     return legalizeShuffleVector(MI, MRI, B);
1501   case TargetOpcode::G_FSIN:
1502   case TargetOpcode::G_FCOS:
1503     return legalizeSinCos(MI, MRI, B);
1504   case TargetOpcode::G_GLOBAL_VALUE:
1505     return legalizeGlobalValue(MI, MRI, B);
1506   case TargetOpcode::G_LOAD:
1507     return legalizeLoad(MI, MRI, B, Observer);
1508   case TargetOpcode::G_FMAD:
1509     return legalizeFMad(MI, MRI, B);
1510   case TargetOpcode::G_FDIV:
1511     return legalizeFDIV(MI, MRI, B);
1512   case TargetOpcode::G_UDIV:
1513   case TargetOpcode::G_UREM:
1514     return legalizeUDIV_UREM(MI, MRI, B);
1515   case TargetOpcode::G_SDIV:
1516   case TargetOpcode::G_SREM:
1517     return legalizeSDIV_SREM(MI, MRI, B);
1518   case TargetOpcode::G_ATOMIC_CMPXCHG:
1519     return legalizeAtomicCmpXChg(MI, MRI, B);
1520   case TargetOpcode::G_FLOG:
1521     return legalizeFlog(MI, B, numbers::ln2f);
1522   case TargetOpcode::G_FLOG10:
1523     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1524   case TargetOpcode::G_FEXP:
1525     return legalizeFExp(MI, B);
1526   case TargetOpcode::G_FPOW:
1527     return legalizeFPow(MI, B);
1528   case TargetOpcode::G_FFLOOR:
1529     return legalizeFFloor(MI, MRI, B);
1530   case TargetOpcode::G_BUILD_VECTOR:
1531     return legalizeBuildVector(MI, MRI, B);
1532   default:
1533     return false;
1534   }
1535 
1536   llvm_unreachable("expected switch to return");
1537 }
1538 
1539 Register AMDGPULegalizerInfo::getSegmentAperture(
1540   unsigned AS,
1541   MachineRegisterInfo &MRI,
1542   MachineIRBuilder &B) const {
1543   MachineFunction &MF = B.getMF();
1544   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1545   const LLT S32 = LLT::scalar(32);
1546 
1547   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1548 
1549   if (ST.hasApertureRegs()) {
1550     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1551     // getreg.
1552     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1553         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1554         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1555     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1556         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1557         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1558     unsigned Encoding =
1559         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1560         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1561         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1562 
1563     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1564 
1565     B.buildInstr(AMDGPU::S_GETREG_B32)
1566       .addDef(GetReg)
1567       .addImm(Encoding);
1568     MRI.setType(GetReg, S32);
1569 
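         // The 16-bit field read by getreg holds the upper half of the 32-bit
         // aperture base, so shift it back into position (the shift amount,
         // WidthM1 + 1, is the 16-bit field width).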
1570     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1571     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1572   }
1573 
1574   Register QueuePtr = MRI.createGenericVirtualRegister(
1575     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1576 
1577   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1578   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1579     return Register();
1580 
1581   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1582   // private_segment_aperture_base_hi.
1583   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1584 
1585   // TODO: can we be smarter about machine pointer info?
1586   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1587   MachineMemOperand *MMO = MF.getMachineMemOperand(
1588       PtrInfo,
1589       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1590           MachineMemOperand::MOInvariant,
1591       4, commonAlignment(Align(64), StructOffset));
1592 
1593   Register LoadAddr;
1594 
1595   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1596   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1597 }
1598 
1599 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1600   MachineInstr &MI, MachineRegisterInfo &MRI,
1601   MachineIRBuilder &B) const {
1602   MachineFunction &MF = B.getMF();
1603 
1604   const LLT S32 = LLT::scalar(32);
1605   Register Dst = MI.getOperand(0).getReg();
1606   Register Src = MI.getOperand(1).getReg();
1607 
1608   LLT DstTy = MRI.getType(Dst);
1609   LLT SrcTy = MRI.getType(Src);
1610   unsigned DestAS = DstTy.getAddressSpace();
1611   unsigned SrcAS = SrcTy.getAddressSpace();
1612 
1613   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1614   // vector element.
1615   assert(!DstTy.isVector());
1616 
1617   const AMDGPUTargetMachine &TM
1618     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1619 
1620   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1621   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1622     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1623     return true;
1624   }
1625 
1626   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1627     // Truncate.
1628     B.buildExtract(Dst, Src, 0);
1629     MI.eraseFromParent();
1630     return true;
1631   }
1632 
1633   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1634     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1635     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1636 
1637     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1638     // another. Merge operands are required to be the same type, but creating an
1639     // extra ptrtoint would be kind of pointless.
1640     auto HighAddr = B.buildConstant(
1641       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1642     B.buildMerge(Dst, {Src, HighAddr});
1643     MI.eraseFromParent();
1644     return true;
1645   }
1646 
1647   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1648     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1649            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1650     unsigned NullVal = TM.getNullPointerValue(DestAS);
1651 
1652     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1653     auto FlatNull = B.buildConstant(SrcTy, 0);
1654 
1655     // Extract low 32-bits of the pointer.
1656     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1657 
1658     auto CmpRes =
1659         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1660     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1661 
1662     MI.eraseFromParent();
1663     return true;
1664   }
1665 
1666   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1667     return false;
1668 
1669   if (!ST.hasFlatAddressSpace())
1670     return false;
1671 
1672   auto SegmentNull =
1673       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1674   auto FlatNull =
1675       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
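       // A segment (local/private) pointer is extended to a flat pointer by
       // placing the 32-bit segment offset in the low half and the aperture
       // base in the high half; a segment null is mapped to the flat null.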
1676 
1677   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1678   if (!ApertureReg.isValid())
1679     return false;
1680 
1681   auto CmpRes =
1682       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1683 
1684   // Coerce the type of the low half of the result so we can use merge_values.
1685   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1686 
1687   // TODO: Should we allow mismatched types but matching sizes in merges to
1688   // avoid the ptrtoint?
1689   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1690   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1691 
1692   MI.eraseFromParent();
1693   return true;
1694 }
1695 
1696 bool AMDGPULegalizerInfo::legalizeFrint(
1697   MachineInstr &MI, MachineRegisterInfo &MRI,
1698   MachineIRBuilder &B) const {
1699   Register Src = MI.getOperand(1).getReg();
1700   LLT Ty = MRI.getType(Src);
1701   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1702 
1703   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1704   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
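       // C1 is 2^52; adding and then subtracting copysign(2^52, Src) rounds
       // Src to the nearest integer in double precision. C2 is just below
       // 2^52: anything larger in magnitude is already integral, so the final
       // select returns the source unchanged in that case.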
1705 
1706   auto C1 = B.buildFConstant(Ty, C1Val);
1707   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1708 
1709   // TODO: Should this propagate fast-math-flags?
1710   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1711   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1712 
1713   auto C2 = B.buildFConstant(Ty, C2Val);
1714   auto Fabs = B.buildFAbs(Ty, Src);
1715 
1716   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1717   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1718   return true;
1719 }
1720 
1721 bool AMDGPULegalizerInfo::legalizeFceil(
1722   MachineInstr &MI, MachineRegisterInfo &MRI,
1723   MachineIRBuilder &B) const {
1724 
1725   const LLT S1 = LLT::scalar(1);
1726   const LLT S64 = LLT::scalar(64);
1727 
1728   Register Src = MI.getOperand(1).getReg();
1729   assert(MRI.getType(Src) == S64);
1730 
1731   // result = trunc(src)
1732   // if (src > 0.0 && src != result)
1733   //   result += 1.0
1734 
1735   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1736 
1737   const auto Zero = B.buildFConstant(S64, 0.0);
1738   const auto One = B.buildFConstant(S64, 1.0);
1739   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1740   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1741   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1742   auto Add = B.buildSelect(S64, And, One, Zero);
1743 
1744   // TODO: Should this propagate fast-math-flags?
1745   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1746   return true;
1747 }
1748 
1749 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1750                                               MachineIRBuilder &B) {
1751   const unsigned FractBits = 52;
1752   const unsigned ExpBits = 11;
1753   LLT S32 = LLT::scalar(32);
1754 
1755   auto Const0 = B.buildConstant(S32, FractBits - 32);
1756   auto Const1 = B.buildConstant(S32, ExpBits);
1757 
1758   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1759     .addUse(Register(Hi))
1760     .addUse(Const0.getReg(0)).addUse(Const1.getReg(0));
1761 
1762   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1763 }
1764 
1765 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1766   MachineInstr &MI, MachineRegisterInfo &MRI,
1767   MachineIRBuilder &B) const {
1768   const LLT S1 = LLT::scalar(1);
1769   const LLT S32 = LLT::scalar(32);
1770   const LLT S64 = LLT::scalar(64);
1771 
1772   Register Src = MI.getOperand(1).getReg();
1773   assert(MRI.getType(Src) == S64);
1774 
1775   // TODO: Should this use extract since the low half is unused?
1776   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1777   Register Hi = Unmerge.getReg(1);
1778 
1779   // Extract the upper half, since this is where we will find the sign and
1780   // exponent.
1781   auto Exp = extractF64Exponent(Hi, B);
1782 
1783   const unsigned FractBits = 52;
1784 
1785   // Extract the sign bit.
1786   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1787   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1788 
1789   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1790 
1791   const auto Zero32 = B.buildConstant(S32, 0);
1792 
1793   // Extend back to 64-bits.
1794   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1795 
1796   auto Shr = B.buildAShr(S64, FractMask, Exp);
1797   auto Not = B.buildNot(S64, Shr);
1798   auto Tmp0 = B.buildAnd(S64, Src, Not);
1799   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1800 
1801   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1802   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1803 
1804   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1805   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1806   return true;
1807 }
1808 
1809 bool AMDGPULegalizerInfo::legalizeITOFP(
1810   MachineInstr &MI, MachineRegisterInfo &MRI,
1811   MachineIRBuilder &B, bool Signed) const {
1812 
1813   Register Dst = MI.getOperand(0).getReg();
1814   Register Src = MI.getOperand(1).getReg();
1815 
1816   const LLT S64 = LLT::scalar(64);
1817   const LLT S32 = LLT::scalar(32);
1818 
1819   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1820 
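       // Convert the two 32-bit halves separately and recombine as
       // hi * 2^32 + lo: the high half keeps the signedness of the original
       // conversion, the low half is always unsigned, and ldexp performs the
       // scaling by 2^32.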
1821   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1822 
1823   auto CvtHi = Signed ?
1824     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1825     B.buildUITOFP(S64, Unmerge.getReg(1));
1826 
1827   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1828 
1829   auto ThirtyTwo = B.buildConstant(S32, 32);
1830   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1831     .addUse(CvtHi.getReg(0))
1832     .addUse(ThirtyTwo.getReg(0));
1833 
1834   // TODO: Should this propagate fast-math-flags?
1835   B.buildFAdd(Dst, LdExp, CvtLo);
1836   MI.eraseFromParent();
1837   return true;
1838 }
1839 
1840 // TODO: Copied from DAG implementation. Verify logic and document how this
1841 // actually works.
1842 bool AMDGPULegalizerInfo::legalizeFPTOI(
1843   MachineInstr &MI, MachineRegisterInfo &MRI,
1844   MachineIRBuilder &B, bool Signed) const {
1845 
1846   Register Dst = MI.getOperand(0).getReg();
1847   Register Src = MI.getOperand(1).getReg();
1848 
1849   const LLT S64 = LLT::scalar(64);
1850   const LLT S32 = LLT::scalar(32);
1851 
1852   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1853 
1854   unsigned Flags = MI.getFlags();
1855 
1856   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1857   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1858   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
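       // K0 is 2^-32 and K1 is -(2^32): FloorMul is the value of the upper 32
       // bits of the truncated value and the fma recovers the remaining low
       // 32 bits, so each half fits in a 32-bit fptoui/fptosi.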
1859 
1860   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1861   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1862   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1863 
1864   auto Hi = Signed ?
1865     B.buildFPTOSI(S32, FloorMul) :
1866     B.buildFPTOUI(S32, FloorMul);
1867   auto Lo = B.buildFPTOUI(S32, Fma);
1868 
1869   B.buildMerge(Dst, { Lo, Hi });
1870   MI.eraseFromParent();
1871 
1872   return true;
1873 }
1874 
1875 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1876                                                MachineInstr &MI) const {
1877   MachineFunction &MF = Helper.MIRBuilder.getMF();
1878   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1879 
1880   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1881                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1882 
1883   // With ieee_mode disabled, the instructions have the correct behavior
1884   // already for G_FMINNUM/G_FMAXNUM
1885   if (!MFI->getMode().IEEE)
1886     return !IsIEEEOp;
1887 
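       // In IEEE mode the hardware min/max already implement the *_IEEE
       // variants; the plain G_FMINNUM/G_FMAXNUM forms need the generic
       // lowering, which handles quieting of signaling NaNs.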
1888   if (IsIEEEOp)
1889     return true;
1890 
1891   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1892 }
1893 
1894 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1895   MachineInstr &MI, MachineRegisterInfo &MRI,
1896   MachineIRBuilder &B) const {
1897   // TODO: Should move some of this into LegalizerHelper.
1898 
1899   // TODO: Promote dynamic indexing of s16 to s32
1900 
1901   // FIXME: Artifact combiner probably should have replaced the truncated
1902   // constant before this, so we shouldn't need
1903   // getConstantVRegValWithLookThrough.
1904   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1905     MI.getOperand(2).getReg(), MRI);
1906   if (!IdxVal) // Dynamic case will be selected to register indexing.
1907     return true;
1908 
1909   Register Dst = MI.getOperand(0).getReg();
1910   Register Vec = MI.getOperand(1).getReg();
1911 
1912   LLT VecTy = MRI.getType(Vec);
1913   LLT EltTy = VecTy.getElementType();
1914   assert(EltTy == MRI.getType(Dst));
1915 
1916   if (IdxVal->Value < VecTy.getNumElements())
1917     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1918   else
1919     B.buildUndef(Dst);
1920 
1921   MI.eraseFromParent();
1922   return true;
1923 }
1924 
1925 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   MachineIRBuilder &B) const {
1928   // TODO: Should move some of this into LegalizerHelper.
1929 
1930   // TODO: Promote dynamic indexing of s16 to s32
1931 
1932   // FIXME: Artifact combiner probably should have replaced the truncated
1933   // constant before this, so we shouldn't need
1934   // getConstantVRegValWithLookThrough.
1935   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1936     MI.getOperand(3).getReg(), MRI);
1937   if (!IdxVal) // Dynamic case will be selected to register indexing.
1938     return true;
1939 
1940   Register Dst = MI.getOperand(0).getReg();
1941   Register Vec = MI.getOperand(1).getReg();
1942   Register Ins = MI.getOperand(2).getReg();
1943 
1944   LLT VecTy = MRI.getType(Vec);
1945   LLT EltTy = VecTy.getElementType();
1946   assert(EltTy == MRI.getType(Ins));
1947 
1948   if (IdxVal->Value < VecTy.getNumElements())
1949     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1950   else
1951     B.buildUndef(Dst);
1952 
1953   MI.eraseFromParent();
1954   return true;
1955 }
1956 
1957 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1958   MachineInstr &MI, MachineRegisterInfo &MRI,
1959   MachineIRBuilder &B) const {
1960   const LLT V2S16 = LLT::vector(2, 16);
1961 
1962   Register Dst = MI.getOperand(0).getReg();
1963   Register Src0 = MI.getOperand(1).getReg();
1964   LLT DstTy = MRI.getType(Dst);
1965   LLT SrcTy = MRI.getType(Src0);
1966 
1967   if (SrcTy == V2S16 && DstTy == V2S16 &&
1968       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1969     return true;
1970 
1971   MachineIRBuilder HelperBuilder(MI);
1972   GISelObserverWrapper DummyObserver;
1973   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1974   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1975 }
1976 
1977 bool AMDGPULegalizerInfo::legalizeSinCos(
1978   MachineInstr &MI, MachineRegisterInfo &MRI,
1979   MachineIRBuilder &B) const {
1980 
1981   Register DstReg = MI.getOperand(0).getReg();
1982   Register SrcReg = MI.getOperand(1).getReg();
1983   LLT Ty = MRI.getType(DstReg);
1984   unsigned Flags = MI.getFlags();
1985 
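       // The hardware sin/cos take an input scaled by 1/(2*pi); on subtargets
       // with a reduced trig input range the scaled value is first wrapped
       // into [0, 1) with the fract intrinsic.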
1986   Register TrigVal;
1987   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1988   if (ST.hasTrigReducedRange()) {
1989     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1990     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1991       .addUse(MulVal.getReg(0))
1992       .setMIFlags(Flags).getReg(0);
1993   } else
1994     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1995 
1996   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1997     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1998   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1999     .addUse(TrigVal)
2000     .setMIFlags(Flags);
2001   MI.eraseFromParent();
2002   return true;
2003 }
2004 
2005 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2006                                                   MachineIRBuilder &B,
2007                                                   const GlobalValue *GV,
2008                                                   int64_t Offset,
2009                                                   unsigned GAFlags) const {
2010   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2011   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2012   // to the following code sequence:
2013   //
2014   // For constant address space:
2015   //   s_getpc_b64 s[0:1]
2016   //   s_add_u32 s0, s0, $symbol
2017   //   s_addc_u32 s1, s1, 0
2018   //
2019   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2020   //   a fixup or relocation is emitted to replace $symbol with a literal
2021   //   constant, which is a pc-relative offset from the encoding of the $symbol
2022   //   operand to the global variable.
2023   //
2024   // For global address space:
2025   //   s_getpc_b64 s[0:1]
2026   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2027   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2028   //
2029   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2030   //   fixups or relocations are emitted to replace $symbol@*@lo and
2031   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2032   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2033   //   operand to the global variable.
2034   //
2035   // What we want here is an offset from the value returned by s_getpc
2036   // (which is the address of the s_add_u32 instruction) to the global
2037   // variable, but since the encoding of $symbol starts 4 bytes after the start
2038   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2039   // small. This requires us to add 4 to the global variable offset in order to
2040   // compute the correct address.
2041 
2042   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2043 
2044   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2045     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2046 
2047   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2048     .addDef(PCReg);
2049 
2050   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2051   if (GAFlags == SIInstrInfo::MO_NONE)
2052     MIB.addImm(0);
2053   else
2054     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2055 
2056   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2057 
2058   if (PtrTy.getSizeInBits() == 32)
2059     B.buildExtract(DstReg, PCReg, 0);
2060   return true;
2061 }
2062 
2063 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2064   MachineInstr &MI, MachineRegisterInfo &MRI,
2065   MachineIRBuilder &B) const {
2066   Register DstReg = MI.getOperand(0).getReg();
2067   LLT Ty = MRI.getType(DstReg);
2068   unsigned AS = Ty.getAddressSpace();
2069 
2070   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2071   MachineFunction &MF = B.getMF();
2072   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2073 
2074   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2075     if (!MFI->isEntryFunction()) {
2076       const Function &Fn = MF.getFunction();
2077       DiagnosticInfoUnsupported BadLDSDecl(
2078         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2079         DS_Warning);
2080       Fn.getContext().diagnose(BadLDSDecl);
2081 
2082       // We currently don't have a way to correctly allocate LDS objects that
2083       // aren't directly associated with a kernel. We do force inlining of
2084       // functions that use local objects. However, if these dead functions are
2085       // not eliminated, we don't want a compile time error. Just emit a warning
2086       // and a trap, since there should be no callable path here.
2087       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2088       B.buildUndef(DstReg);
2089       MI.eraseFromParent();
2090       return true;
2091     }
2092 
2093     // TODO: We could emit code to handle the initialization somewhere.
2094     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2095       const SITargetLowering *TLI = ST.getTargetLowering();
2096       if (!TLI->shouldUseLDSConstAddress(GV)) {
2097         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2098         return true; // Leave in place.
2099       }
2100 
2101       B.buildConstant(
2102           DstReg,
2103           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2104       MI.eraseFromParent();
2105       return true;
2106     }
2107 
2108     const Function &Fn = MF.getFunction();
2109     DiagnosticInfoUnsupported BadInit(
2110       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2111     Fn.getContext().diagnose(BadInit);
2112     return true;
2113   }
2114 
2115   const SITargetLowering *TLI = ST.getTargetLowering();
2116 
2117   if (TLI->shouldEmitFixup(GV)) {
2118     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2119     MI.eraseFromParent();
2120     return true;
2121   }
2122 
2123   if (TLI->shouldEmitPCReloc(GV)) {
2124     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2125     MI.eraseFromParent();
2126     return true;
2127   }
2128 
2129   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2130   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2131 
2132   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2133       MachinePointerInfo::getGOT(MF),
2134       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2135           MachineMemOperand::MOInvariant,
2136       8 /*Size*/, Align(8));
2137 
2138   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2139 
2140   if (Ty.getSizeInBits() == 32) {
2141     // Truncate if this is a 32-bit constant address.
2142     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2143     B.buildExtract(DstReg, Load, 0);
2144   } else
2145     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2146 
2147   MI.eraseFromParent();
2148   return true;
2149 }
2150 
2151 bool AMDGPULegalizerInfo::legalizeLoad(
2152   MachineInstr &MI, MachineRegisterInfo &MRI,
2153   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2154   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2155   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2156   Observer.changingInstr(MI);
2157   MI.getOperand(1).setReg(Cast.getReg(0));
2158   Observer.changedInstr(MI);
2159   return true;
2160 }
2161 
2162 bool AMDGPULegalizerInfo::legalizeFMad(
2163   MachineInstr &MI, MachineRegisterInfo &MRI,
2164   MachineIRBuilder &B) const {
2165   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2166   assert(Ty.isScalar());
2167 
2168   MachineFunction &MF = B.getMF();
2169   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2170 
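       // G_FMAD stays legal only when denormals for the result type are
       // flushed, where the mad instructions produce the expected result;
       // otherwise fall through to the generic mul+add expansion below.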
2171   // TODO: Always legal with future ftz flag.
2172   // FIXME: Do we need just output?
2173   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2174     return true;
2175   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2176     return true;
2177 
2178   MachineIRBuilder HelperBuilder(MI);
2179   GISelObserverWrapper DummyObserver;
2180   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2181   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2182 }
2183 
2184 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2185   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2186   Register DstReg = MI.getOperand(0).getReg();
2187   Register PtrReg = MI.getOperand(1).getReg();
2188   Register CmpVal = MI.getOperand(2).getReg();
2189   Register NewVal = MI.getOperand(3).getReg();
2190 
2191   assert(SITargetLowering::isFlatGlobalAddrSpace(
2192            MRI.getType(PtrReg).getAddressSpace()) &&
2193          "this should not have been custom lowered");
2194 
2195   LLT ValTy = MRI.getType(CmpVal);
2196   LLT VecTy = LLT::vector(2, ValTy);
2197 
2198   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
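       // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value
       // packed into a single vector operand, with the new value in the low
       // element, matching the hardware cmpswap data operand layout.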
2199 
2200   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2201     .addDef(DstReg)
2202     .addUse(PtrReg)
2203     .addUse(PackedVal)
2204     .setMemRefs(MI.memoperands());
2205 
2206   MI.eraseFromParent();
2207   return true;
2208 }
2209 
2210 bool AMDGPULegalizerInfo::legalizeFlog(
2211   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2212   Register Dst = MI.getOperand(0).getReg();
2213   Register Src = MI.getOperand(1).getReg();
2214   LLT Ty = B.getMRI()->getType(Dst);
2215   unsigned Flags = MI.getFlags();
2216 
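       // log_b(x) = log2(x) * log_b(2); Log2BaseInverted is log_b(2), i.e.
       // ln(2) for G_FLOG and ln(2)/ln(10) for G_FLOG10.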
2217   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2218   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2219 
2220   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2221   MI.eraseFromParent();
2222   return true;
2223 }
2224 
2225 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2226                                        MachineIRBuilder &B) const {
2227   Register Dst = MI.getOperand(0).getReg();
2228   Register Src = MI.getOperand(1).getReg();
2229   unsigned Flags = MI.getFlags();
2230   LLT Ty = B.getMRI()->getType(Dst);
2231 
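       // exp(x) = exp2(x * log2(e)).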
2232   auto K = B.buildFConstant(Ty, numbers::log2e);
2233   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2234   B.buildFExp2(Dst, Mul, Flags);
2235   MI.eraseFromParent();
2236   return true;
2237 }
2238 
2239 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2240                                        MachineIRBuilder &B) const {
2241   Register Dst = MI.getOperand(0).getReg();
2242   Register Src0 = MI.getOperand(1).getReg();
2243   Register Src1 = MI.getOperand(2).getReg();
2244   unsigned Flags = MI.getFlags();
2245   LLT Ty = B.getMRI()->getType(Dst);
2246   const LLT S16 = LLT::scalar(16);
2247   const LLT S32 = LLT::scalar(32);
2248 
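       // pow(x, y) is expanded as exp2(log2(x) * y). The multiply uses the
       // legacy-multiply intrinsic (where 0 * anything == 0), presumably so
       // that a zero log2, e.g. from pow(1, y), still yields 1 for infinite
       // or NaN exponents.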
2249   if (Ty == S32) {
2250     auto Log = B.buildFLog2(S32, Src0, Flags);
2251     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2252       .addUse(Log.getReg(0))
2253       .addUse(Src1)
2254       .setMIFlags(Flags);
2255     B.buildFExp2(Dst, Mul, Flags);
2256   } else if (Ty == S16) {
2257     // There's no f16 fmul_legacy, so we need to convert for it.
2258     auto Log = B.buildFLog2(S16, Src0, Flags);
2259     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2260     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2261     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2262       .addUse(Ext0.getReg(0))
2263       .addUse(Ext1.getReg(0))
2264       .setMIFlags(Flags);
2265 
2266     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2267   } else
2268     return false;
2269 
2270   MI.eraseFromParent();
2271   return true;
2272 }
2273 
2274 // Find a source register, ignoring any possible source modifiers.
2275 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2276   Register ModSrc = OrigSrc;
2277   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2278     ModSrc = SrcFNeg->getOperand(1).getReg();
2279     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2280       ModSrc = SrcFAbs->getOperand(1).getReg();
2281   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2282     ModSrc = SrcFAbs->getOperand(1).getReg();
2283   return ModSrc;
2284 }
2285 
2286 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2287                                          MachineRegisterInfo &MRI,
2288                                          MachineIRBuilder &B) const {
2289 
2290   const LLT S1 = LLT::scalar(1);
2291   const LLT S64 = LLT::scalar(64);
2292   Register Dst = MI.getOperand(0).getReg();
2293   Register OrigSrc = MI.getOperand(1).getReg();
2294   unsigned Flags = MI.getFlags();
2295   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2296          "this should not have been custom lowered");
2297 
2298   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2299   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2300   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2301   // V_FRACT bug is:
2302   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2303   //
2304   // Convert floor(x) to (x - fract(x))
2305 
2306   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2307     .addUse(OrigSrc)
2308     .setMIFlags(Flags);
2309 
2310   // Give source modifier matching some assistance before obscuring a foldable
2311   // pattern.
2312 
2313   // TODO: We can avoid the neg on the fract? The input sign to fract
2314   // shouldn't matter?
2315   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2316 
2317   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2318 
2319   Register Min = MRI.createGenericVirtualRegister(S64);
2320 
2321   // We don't need to concern ourselves with the snan handling difference, so
2322   // use the one which will directly select.
2323   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2324   if (MFI->getMode().IEEE)
2325     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2326   else
2327     B.buildFMinNum(Min, Fract, Const, Flags);
2328 
2329   Register CorrectedFract = Min;
2330   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2331     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2332     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2333   }
2334 
2335   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2336   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2337 
2338   MI.eraseFromParent();
2339   return true;
2340 }
2341 
2342 // Turn an illegal packed v2s16 build vector into bit operations.
2343 // TODO: This should probably be a bitcast action in LegalizerHelper.
2344 bool AMDGPULegalizerInfo::legalizeBuildVector(
2345   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2346   Register Dst = MI.getOperand(0).getReg();
2347   const LLT S32 = LLT::scalar(32);
2348   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2349 
2350   Register Src0 = MI.getOperand(1).getReg();
2351   Register Src1 = MI.getOperand(2).getReg();
2352   assert(MRI.getType(Src0) == LLT::scalar(16));
2353 
2354   auto Merge = B.buildMerge(S32, {Src0, Src1});
2355   B.buildBitcast(Dst, Merge);
2356 
2357   MI.eraseFromParent();
2358   return true;
2359 }
2360 
2361 // Return the use branch instruction, otherwise null if the usage is invalid.
2362 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2363                                        MachineRegisterInfo &MRI,
2364                                        MachineInstr *&Br,
2365                                        MachineBasicBlock *&UncondBrTarget) {
2366   Register CondDef = MI.getOperand(0).getReg();
2367   if (!MRI.hasOneNonDBGUse(CondDef))
2368     return nullptr;
2369 
2370   MachineBasicBlock *Parent = MI.getParent();
2371   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2372   if (UseMI.getParent() != Parent ||
2373       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2374     return nullptr;
2375 
2376   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2377   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2378   if (Next == Parent->end()) {
2379     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2380     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2381       return nullptr;
2382     UncondBrTarget = &*NextMBB;
2383   } else {
2384     if (Next->getOpcode() != AMDGPU::G_BR)
2385       return nullptr;
2386     Br = &*Next;
2387     UncondBrTarget = Br->getOperand(0).getMBB();
2388   }
2389 
2390   return &UseMI;
2391 }
2392 
2393 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2394                                                MachineRegisterInfo &MRI,
2395                                                Register LiveIn,
2396                                                Register PhyReg) const {
2397   assert(PhyReg.isPhysical() && "Physical register expected");
2398 
2399   // Insert the live-in copy, if required, by defining the destination virtual
2400   // register.
2401   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2402   if (!MRI.getVRegDef(LiveIn)) {
2403     // FIXME: Should have scoped insert pt
2404     MachineBasicBlock &OrigInsBB = B.getMBB();
2405     auto OrigInsPt = B.getInsertPt();
2406 
2407     MachineBasicBlock &EntryMBB = B.getMF().front();
2408     EntryMBB.addLiveIn(PhyReg);
2409     B.setInsertPt(EntryMBB, EntryMBB.begin());
2410     B.buildCopy(LiveIn, PhyReg);
2411 
2412     B.setInsertPt(OrigInsBB, OrigInsPt);
2413   }
2414 
2415   return LiveIn;
2416 }
2417 
2418 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2419                                                 MachineRegisterInfo &MRI,
2420                                                 Register PhyReg, LLT Ty,
2421                                                 bool InsertLiveInCopy) const {
2422   assert(PhyReg.isPhysical() && "Physical register expected");
2423 
2424   // Get or create a virtual live-in register.
2425   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2426   if (!LiveIn) {
2427     LiveIn = MRI.createGenericVirtualRegister(Ty);
2428     MRI.addLiveIn(PhyReg, LiveIn);
2429   }
2430 
2431   // When the actual copy required is from a virtual register to a physical
2432   // register (to be inserted later), inserting a live-in copy from the
2433   // physical register to a virtual register is not required.
2434   if (!InsertLiveInCopy)
2435     return LiveIn;
2436 
2437   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2438 }
2439 
2440 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2441     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2442   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2443   const ArgDescriptor *Arg;
2444   const TargetRegisterClass *RC;
2445   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2446   if (!Arg) {
2447     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2448     return nullptr;
2449   }
2450   return Arg;
2451 }
2452 
2453 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2454                                          const ArgDescriptor *Arg) const {
2455   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2456     return false; // TODO: Handle these
2457 
2458   Register SrcReg = Arg->getRegister();
2459   assert(SrcReg.isPhysical() && "Physical register expected");
2460   assert(DstReg.isVirtual() && "Virtual register expected");
2461 
2462   MachineRegisterInfo &MRI = *B.getMRI();
2463 
2464   LLT Ty = MRI.getType(DstReg);
2465   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2466 
2467   if (Arg->isMasked()) {
2468     // TODO: Should we try to emit this once in the entry block?
2469     const LLT S32 = LLT::scalar(32);
2470     const unsigned Mask = Arg->getMask();
2471     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
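         // Masked arguments occupy a contiguous bit field of the input
         // register (e.g. the packed workitem IDs); shift the field down to
         // bit 0 and mask off the remaining high bits.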
2472 
2473     Register AndMaskSrc = LiveIn;
2474 
2475     if (Shift != 0) {
2476       auto ShiftAmt = B.buildConstant(S32, Shift);
2477       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2478     }
2479 
2480     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2481   } else {
2482     B.buildCopy(DstReg, LiveIn);
2483   }
2484 
2485   return true;
2486 }
2487 
2488 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2489     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2490     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2491 
2492   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2493   if (!Arg)
2494     return false;
2495 
2496   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2497     return false;
2498 
2499   MI.eraseFromParent();
2500   return true;
2501 }
2502 
2503 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2504                                        MachineRegisterInfo &MRI,
2505                                        MachineIRBuilder &B) const {
2506   Register Dst = MI.getOperand(0).getReg();
2507   LLT DstTy = MRI.getType(Dst);
2508   LLT S16 = LLT::scalar(16);
2509   LLT S32 = LLT::scalar(32);
2510   LLT S64 = LLT::scalar(64);
2511 
2512   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2513     return true;
2514 
2515   if (DstTy == S16)
2516     return legalizeFDIV16(MI, MRI, B);
2517   if (DstTy == S32)
2518     return legalizeFDIV32(MI, MRI, B);
2519   if (DstTy == S64)
2520     return legalizeFDIV64(MI, MRI, B);
2521 
2522   return false;
2523 }
2524 
2525 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2526   const LLT S32 = LLT::scalar(32);
2527 
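       // Approximate 2^32 / Src: convert to float, take the reciprocal with
       // RCP_IFLAG, scale by 2^32 (0x4f800000 is 4294967296.0f), and convert
       // back to an unsigned integer.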
2528   auto Cvt0 = B.buildUITOFP(S32, Src);
2529   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2530   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2531   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2532   return B.buildFPTOUI(S32, Mul).getReg(0);
2533 }
2534 
2535 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2536                                                   Register DstReg,
2537                                                   Register Num,
2538                                                   Register Den,
2539                                                   bool IsDiv) const {
2540   const LLT S1 = LLT::scalar(1);
2541   const LLT S32 = LLT::scalar(32);
2542 
2543   // RCP =  URECIP(Den) = 2^32 / Den + e
2544   // e is rounding error.
2545   auto RCP = buildDivRCP(B, Den);
2546 
2547   // RCP_LO = mul(RCP, Den)
2548   auto RCP_LO = B.buildMul(S32, RCP, Den);
2549 
2550   // RCP_HI = mulhu(RCP, Den)
2551   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2552 
2553   // NEG_RCP_LO = -RCP_LO
2554   auto Zero = B.buildConstant(S32, 0);
2555   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2556 
2557   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2558   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2559   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2560 
2561   // Calculate the rounding error from the URECIP instruction
2562   // E = mulhu(ABS_RCP_LO, RCP)
2563   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2564 
2565   // RCP_A_E = RCP + E
2566   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2567 
2568   // RCP_S_E = RCP - E
2569   auto RCP_S_E = B.buildSub(S32, RCP, E);
2570 
2571   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2572   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2573 
2574   // Quotient = mulhu(Tmp0, Num)
2575   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2576 
2577   // Num_S_Remainder = Quotient * Den
2578   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2579 
2580   // Remainder = Num - Num_S_Remainder
2581   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2582 
2583   // Remainder_GE_Den = Remainder >= Den
2584   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2585 
2586   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2587   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2588                                        Num, Num_S_Remainder);
2589 
2590   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2591   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2592 
2593   // Calculate Division result:
2594 
2595   // Quotient_A_One = Quotient + 1
2596   auto One = B.buildConstant(S32, 1);
2597   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2598 
2599   // Quotient_S_One = Quotient - 1
2600   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2601 
2602   // Div = (Tmp1 ? Quotient_A_One : Quotient)
2603   auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);
2604 
2605   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2606   if (IsDiv) {
2607     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2608   } else {
2609     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2610 
2611     // Calculate Rem result:
2612     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2613 
2614     // Remainder_A_Den = Remainder + Den
2615     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2616 
2617     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2618     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2619 
2620     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2621     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2622   }
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2626                                               MachineRegisterInfo &MRI,
2627                                               MachineIRBuilder &B) const {
2628   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2629   Register DstReg = MI.getOperand(0).getReg();
2630   Register Num = MI.getOperand(1).getReg();
2631   Register Den = MI.getOperand(2).getReg();
2632   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2633   MI.eraseFromParent();
2634   return true;
2635 }
2636 
2637 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2638 //
2639 // Return lo, hi of result
2640 //
2641 // %cvt.lo = G_UITOFP Val.lo
2642 // %cvt.hi = G_UITOFP Val.hi
2643 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2644 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2645 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2646 // %mul2 = G_FMUL %mul1, 2**(-32)
2647 // %trunc = G_INTRINSIC_TRUNC %mul2
2648 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2649 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2650 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2651                                                        Register Val) {
2652   const LLT S32 = LLT::scalar(32);
2653   auto Unmerge = B.buildUnmerge(S32, Val);
2654 
2655   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2656   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2657 
2658   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2659                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2660 
2661   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2662   auto Mul1 =
2663       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2664 
2665   // 2**(-32)
2666   auto Mul2 =
2667       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2668   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2669 
2670   // -(2**32)
2671   auto Mad2 = B.buildFMAD(S32, Trunc,
2672                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2673 
2674   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2675   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2676 
2677   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2678 }
2679 
2680 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2681                                                   Register DstReg,
2682                                                   Register Numer,
2683                                                   Register Denom,
2684                                                   bool IsDiv) const {
2685   const LLT S32 = LLT::scalar(32);
2686   const LLT S64 = LLT::scalar(64);
2687   const LLT S1 = LLT::scalar(1);
2688   Register RcpLo, RcpHi;
2689 
2690   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2691 
2692   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
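       // The initial reciprocal estimate is refined with two correction steps
       // below (producing Add1 and then Add2) before it is used to form the
       // quotient and remainder.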
2693 
2694   auto Zero64 = B.buildConstant(S64, 0);
2695   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2696 
2697   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2698   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2699 
2700   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2701   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2702   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2703 
2704   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2705   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2706   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2707   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2708 
2709   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2710   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2711   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2712   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2713   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2714 
2715   auto Zero32 = B.buildConstant(S32, 0);
2716   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2717   auto Add2_HiC =
2718       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2719   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2720   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2721 
2722   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2723   Register NumerLo = UnmergeNumer.getReg(0);
2724   Register NumerHi = UnmergeNumer.getReg(1);
2725 
2726   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2727   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2728   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2729   Register Mul3_Lo = UnmergeMul3.getReg(0);
2730   Register Mul3_Hi = UnmergeMul3.getReg(1);
2731   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2732   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2733   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2734   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2735 
2736   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2737   Register DenomLo = UnmergeDenom.getReg(0);
2738   Register DenomHi = UnmergeDenom.getReg(1);
2739 
2740   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2741   auto C1 = B.buildSExt(S32, CmpHi);
2742 
2743   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2744   auto C2 = B.buildSExt(S32, CmpLo);
2745 
2746   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2747   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2748 
2749   // TODO: Here and below, portions of the code can be enclosed in if/endif.
2750   // Currently control flow is unconditional and we have 4 selects after the
2751   // potential endif to substitute for PHIs.
2752 
2753   // if C3 != 0 ...
2754   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2755   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2756   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2757   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2758 
2759   auto One64 = B.buildConstant(S64, 1);
2760   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2761 
2762   auto C4 =
2763       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2764   auto C5 =
2765       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2766   auto C6 = B.buildSelect(
2767       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2768 
2769   // if (C6 != 0)
2770   auto Add4 = B.buildAdd(S64, Add3, One64);
2771   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2772 
2773   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2774   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2775   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2776 
2777   // endif C6
2778   // endif C3
2779 
2780   if (IsDiv) {
2781     auto Sel1 = B.buildSelect(
2782         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2783     B.buildSelect(DstReg,
2784                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2785   } else {
2786     auto Sel2 = B.buildSelect(
2787         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2788     B.buildSelect(DstReg,
2789                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2790   }
2791 }
2792 
2793 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2794                                             MachineRegisterInfo &MRI,
2795                                             MachineIRBuilder &B) const {
2796   const LLT S64 = LLT::scalar(64);
2797   const LLT S32 = LLT::scalar(32);
2798   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2799   Register DstReg = MI.getOperand(0).getReg();
2800   Register Num = MI.getOperand(1).getReg();
2801   Register Den = MI.getOperand(2).getReg();
2802   LLT Ty = MRI.getType(DstReg);
2803 
2804   if (Ty == S32)
2805     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2806   else if (Ty == S64)
2807     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2808   else
2809     return false;
2810 
2811   MI.eraseFromParent();
2812   return true;
2814 }
2815 
2816 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2817                                             MachineRegisterInfo &MRI,
2818                                             MachineIRBuilder &B) const {
2819   const LLT S64 = LLT::scalar(64);
2820   const LLT S32 = LLT::scalar(32);
2821 
2822   Register DstReg = MI.getOperand(0).getReg();
2823   const LLT Ty = MRI.getType(DstReg);
2824   if (Ty != S32 && Ty != S64)
2825     return false;
2826 
2827   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2828 
2829   Register LHS = MI.getOperand(1).getReg();
2830   Register RHS = MI.getOperand(2).getReg();
2831 
2832   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2833   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2834   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
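       // Take absolute values via (x + sign) ^ sign, where sign is x shifted
       // right arithmetically by (bits - 1): all ones for negative x, zero
       // otherwise. The unsigned result is sign-corrected afterwards.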
2835 
2836   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2837   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2838 
2839   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2840   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2841 
2842   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2843   if (Ty == S32)
2844     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2845   else
2846     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2847 
2848   Register Sign;
2849   if (IsDiv)
2850     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2851   else
2852     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2853 
2854   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2855   B.buildSub(DstReg, UDivRem, Sign);
2856 
2857   MI.eraseFromParent();
2858   return true;
2859 }
2860 
2861 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2862                                                  MachineRegisterInfo &MRI,
2863                                                  MachineIRBuilder &B) const {
2864   Register Res = MI.getOperand(0).getReg();
2865   Register LHS = MI.getOperand(1).getReg();
2866   Register RHS = MI.getOperand(2).getReg();
2867 
2868   uint16_t Flags = MI.getFlags();
2869 
2870   LLT ResTy = MRI.getType(Res);
2871   LLT S32 = LLT::scalar(32);
2872   LLT S64 = LLT::scalar(64);
2873 
2874   const MachineFunction &MF = B.getMF();
2875   bool Unsafe =
2876     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2877 
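  // Without the relevant fast-math relaxations this expansion is not accurate
  // enough: fall back to the full lowering for f64, and for f32 when denormals
  // must be preserved.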
2878   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2879     return false;
2880 
2881   if (!Unsafe && ResTy == S32 &&
2882       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2883     return false;
2884 
2885   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2886     // 1 / x -> RCP(x)
2887     if (CLHS->isExactlyValue(1.0)) {
2888       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2889         .addUse(RHS)
2890         .setMIFlags(Flags);
2891 
2892       MI.eraseFromParent();
2893       return true;
2894     }
2895 
2896     // -1 / x -> RCP( FNEG(x) )
2897     if (CLHS->isExactlyValue(-1.0)) {
2898       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2899       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2900         .addUse(FNeg.getReg(0))
2901         .setMIFlags(Flags);
2902 
2903       MI.eraseFromParent();
2904       return true;
2905     }
2906   }
2907 
2908   // x / y -> x * (1.0 / y)
2909   if (Unsafe) {
2910     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2911       .addUse(RHS)
2912       .setMIFlags(Flags);
2913     B.buildFMul(Res, LHS, RCP, Flags);
2914 
2915     MI.eraseFromParent();
2916     return true;
2917   }
2918 
2919   return false;
2920 }
2921 
2922 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2923                                          MachineRegisterInfo &MRI,
2924                                          MachineIRBuilder &B) const {
2925   Register Res = MI.getOperand(0).getReg();
2926   Register LHS = MI.getOperand(1).getReg();
2927   Register RHS = MI.getOperand(2).getReg();
2928 
2929   uint16_t Flags = MI.getFlags();
2930 
2931   LLT S16 = LLT::scalar(16);
2932   LLT S32 = LLT::scalar(32);
2933 
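  // Expand f16 division by extending to f32, multiplying by the f32 reciprocal
  // of the denominator, truncating back to f16, and letting div_fixup handle
  // the special cases.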
2934   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2935   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2936 
2937   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2938     .addUse(RHSExt.getReg(0))
2939     .setMIFlags(Flags);
2940 
2941   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2942   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2943 
2944   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2945     .addUse(RDst.getReg(0))
2946     .addUse(RHS)
2947     .addUse(LHS)
2948     .setMIFlags(Flags);
2949 
2950   MI.eraseFromParent();
2951   return true;
2952 }
2953 
2954 // Enable or disable FP32 denormal mode: when 'Enable' is true, turn denormal
2955 // handling on; otherwise restore the function's default FP32 denormal mode.
2956 static void toggleSPDenormMode(bool Enable,
2957                                MachineIRBuilder &B,
2958                                const GCNSubtarget &ST,
2959                                AMDGPU::SIModeRegisterDefaults Mode) {
2960   // Set SP denorm mode to this value.
2961   unsigned SPDenormMode =
2962     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2963 
2964   if (ST.hasDenormModeInst()) {
2965     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2966     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2967 
2968     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2969     B.buildInstr(AMDGPU::S_DENORM_MODE)
2970       .addImm(NewDenormModeValue);
2971 
2972   } else {
2973     // Select FP32 bit field in mode register.
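    // The FP32 denormal controls appear to be a 2-bit field at offset 4 of the
    // MODE register, i.e. hwreg(MODE, 4, 2).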
2974     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2975                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2976                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2977 
2978     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2979       .addImm(SPDenormMode)
2980       .addImm(SPDenormModeBitField);
2981   }
2982 }
2983 
2984 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2985                                          MachineRegisterInfo &MRI,
2986                                          MachineIRBuilder &B) const {
2987   Register Res = MI.getOperand(0).getReg();
2988   Register LHS = MI.getOperand(1).getReg();
2989   Register RHS = MI.getOperand(2).getReg();
2990   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2991   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2992 
2993   uint16_t Flags = MI.getFlags();
2994 
2995   LLT S32 = LLT::scalar(32);
2996   LLT S1 = LLT::scalar(1);
2997 
2998   auto One = B.buildFConstant(S32, 1.0f);
2999 
3000   auto DenominatorScaled =
3001     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3002       .addUse(LHS)
3003       .addUse(RHS)
3004       .addImm(0)
3005       .setMIFlags(Flags);
3006   auto NumeratorScaled =
3007     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3008       .addUse(LHS)
3009       .addUse(RHS)
3010       .addImm(1)
3011       .setMIFlags(Flags);
3012 
3013   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3014     .addUse(DenominatorScaled.getReg(0))
3015     .setMIFlags(Flags);
3016   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3017 
3018   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3019   // aren't modeled as reading it.
3020   if (!Mode.allFP32Denormals())
3021     toggleSPDenormMode(true, B, ST, Mode);
3022 
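  // Newton-Raphson style refinement on the scaled operands:
  //   e  = 1 - d * r               (Fma0)
  //   r1 = r + r * e               (Fma1)
  //   q0 = n * r1                  (Mul)
  //   q1 = q0 + r1 * (n - d * q0)  (Fma2, Fma3)
  // Fma4 is the remaining error term consumed by div_fmas below.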
3023   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3024   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3025   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3026   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3027   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3028   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3029 
3030   if (!Mode.allFP32Denormals())
3031     toggleSPDenormMode(false, B, ST, Mode);
3032 
3033   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3034     .addUse(Fma4.getReg(0))
3035     .addUse(Fma1.getReg(0))
3036     .addUse(Fma3.getReg(0))
3037     .addUse(NumeratorScaled.getReg(1))
3038     .setMIFlags(Flags);
3039 
3040   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3041     .addUse(Fmas.getReg(0))
3042     .addUse(RHS)
3043     .addUse(LHS)
3044     .setMIFlags(Flags);
3045 
3046   MI.eraseFromParent();
3047   return true;
3048 }
3049 
3050 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3051                                          MachineRegisterInfo &MRI,
3052                                          MachineIRBuilder &B) const {
3053   Register Res = MI.getOperand(0).getReg();
3054   Register LHS = MI.getOperand(1).getReg();
3055   Register RHS = MI.getOperand(2).getReg();
3056 
3057   uint16_t Flags = MI.getFlags();
3058 
3059   LLT S64 = LLT::scalar(64);
3060   LLT S1 = LLT::scalar(1);
3061 
3062   auto One = B.buildFConstant(S64, 1.0);
3063 
3064   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3065     .addUse(LHS)
3066     .addUse(RHS)
3067     .addImm(0)
3068     .setMIFlags(Flags);
3069 
3070   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3071 
3072   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3073     .addUse(DivScale0.getReg(0))
3074     .setMIFlags(Flags);
3075 
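  // Refine the reciprocal twice (Fma0-Fma3) before forming the quotient and
  // its residual (Mul, Fma4) for div_fmas.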
3076   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3077   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3078   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3079 
3080   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3081     .addUse(LHS)
3082     .addUse(RHS)
3083     .addImm(1)
3084     .setMIFlags(Flags);
3085 
3086   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3087   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3088   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3089 
3090   Register Scale;
3091   if (!ST.hasUsableDivScaleConditionOutput()) {
3092     // Workaround a hardware bug on SI where the condition output from div_scale
3093     // is not usable.
3094 
3095     LLT S32 = LLT::scalar(32);
3096 
3097     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3098     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3099     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3100     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3101 
3102     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3103                               Scale1Unmerge.getReg(1));
3104     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3105                               Scale0Unmerge.getReg(1));
3106     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3107   } else {
3108     Scale = DivScale1.getReg(1);
3109   }
3110 
3111   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3112     .addUse(Fma4.getReg(0))
3113     .addUse(Fma3.getReg(0))
3114     .addUse(Mul.getReg(0))
3115     .addUse(Scale)
3116     .setMIFlags(Flags);
3117 
3118   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3119     .addUse(Fmas.getReg(0))
3120     .addUse(RHS)
3121     .addUse(LHS)
3122     .setMIFlags(Flags);
3123 
3124   MI.eraseFromParent();
3125   return true;
3126 }
3127 
3128 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3129                                                  MachineRegisterInfo &MRI,
3130                                                  MachineIRBuilder &B) const {
3131   Register Res = MI.getOperand(0).getReg();
3132   Register LHS = MI.getOperand(2).getReg();
3133   Register RHS = MI.getOperand(3).getReg();
3134   uint16_t Flags = MI.getFlags();
3135 
3136   LLT S32 = LLT::scalar(32);
3137   LLT S1 = LLT::scalar(1);
3138 
3139   auto Abs = B.buildFAbs(S32, RHS, Flags);
3140   const APFloat C0Val(1.0f);
3141 
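  // If |den| > 2^96 (0x6f800000), pre-scale the denominator by 2^-32
  // (0x2f800000) so its reciprocal does not flush to zero, and apply the same
  // scale to the final product to compensate.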
3142   auto C0 = B.buildConstant(S32, 0x6f800000);
3143   auto C1 = B.buildConstant(S32, 0x2f800000);
3144   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3145 
3146   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3147   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3148 
3149   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3150 
3151   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3152     .addUse(Mul0.getReg(0))
3153     .setMIFlags(Flags);
3154 
3155   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3156 
3157   B.buildFMul(Res, Sel, Mul1, Flags);
3158 
3159   MI.eraseFromParent();
3160   return true;
3161 }
3162 
3163 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3164                                                  MachineRegisterInfo &MRI,
3165                                                  MachineIRBuilder &B) const {
3166   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3167   if (!MFI->isEntryFunction()) {
3168     return legalizePreloadedArgIntrin(MI, MRI, B,
3169                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3170   }
3171 
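  // In an entry function the implicit arguments live directly after the
  // explicit kernel arguments, so the implicit argument pointer is the kernarg
  // segment pointer plus the implicit parameter offset.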
3172   uint64_t Offset =
3173     ST.getTargetLowering()->getImplicitParameterOffset(
3174       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3175   Register DstReg = MI.getOperand(0).getReg();
3176   LLT DstTy = MRI.getType(DstReg);
3177   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3178 
3179   const ArgDescriptor *Arg;
3180   const TargetRegisterClass *RC;
3181   std::tie(Arg, RC)
3182     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3183   if (!Arg)
3184     return false;
3185 
3186   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3187   if (!loadInputValue(KernargPtrReg, B, Arg))
3188     return false;
3189 
3190   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3191   MI.eraseFromParent();
3192   return true;
3193 }
3194 
3195 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3196                                               MachineRegisterInfo &MRI,
3197                                               MachineIRBuilder &B,
3198                                               unsigned AddrSpace) const {
3199   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3200   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3201   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3202   MI.eraseFromParent();
3203   return true;
3204 }
3205 
3206 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3207 // offset (the offset that is included in bounds checking and swizzling, to be
3208 // split between the instruction's voffset and immoffset fields) and soffset
3209 // (the offset that is excluded from bounds checking and swizzling, to go in
3210 // the instruction's soffset field).  This function takes the first kind of
3211 // offset and figures out how to split it between voffset and immoffset.
3212 std::tuple<Register, unsigned, unsigned>
3213 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3214                                         Register OrigOffset) const {
3215   const unsigned MaxImm = 4095;
3216   Register BaseReg;
3217   unsigned TotalConstOffset;
3218   MachineInstr *OffsetDef;
3219   const LLT S32 = LLT::scalar(32);
3220 
3221   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3222     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3223 
3224   unsigned ImmOffset = TotalConstOffset;
3225 
3226   // If the immediate value is too big for the immoffset field, keep only its
3227   // low 12 bits there so that the value that is copied/added for the voffset
3228   // field is a multiple of 4096, and it stands more chance of being CSEd with
3229   // the copy/add for another similar load/store.
3230   // However, do not do that rounding down to a multiple of 4096 if that is a
3231   // negative number, as it appears to be illegal to have a negative offset
3232   // in the vgpr, even if adding the immediate offset makes it positive.
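  // For example, a total constant offset of 5000 becomes ImmOffset = 904 with
  // 4096 added to the voffset.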
3233   unsigned Overflow = ImmOffset & ~MaxImm;
3234   ImmOffset -= Overflow;
3235   if ((int32_t)Overflow < 0) {
3236     Overflow += ImmOffset;
3237     ImmOffset = 0;
3238   }
3239 
3240   if (Overflow != 0) {
3241     if (!BaseReg) {
3242       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3243     } else {
3244       auto OverflowVal = B.buildConstant(S32, Overflow);
3245       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3246     }
3247   }
3248 
3249   if (!BaseReg)
3250     BaseReg = B.buildConstant(S32, 0).getReg(0);
3251 
3252   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3253 }
3254 
3255 /// Handle register layout difference for f16 images for some subtargets.
3256 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3257                                              MachineRegisterInfo &MRI,
3258                                              Register Reg) const {
3259   if (!ST.hasUnpackedD16VMem())
3260     return Reg;
3261 
3262   const LLT S16 = LLT::scalar(16);
3263   const LLT S32 = LLT::scalar(32);
3264   LLT StoreVT = MRI.getType(Reg);
3265   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3266 
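  // e.g. <4 x s16> is unpacked and any-extended to <4 x s32>, one dword per
  // component.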
3267   auto Unmerge = B.buildUnmerge(S16, Reg);
3268 
3269   SmallVector<Register, 4> WideRegs;
3270   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3271     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3272 
3273   int NumElts = StoreVT.getNumElements();
3274 
3275   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3276 }
3277 
3278 Register AMDGPULegalizerInfo::fixStoreSourceType(
3279   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3280   MachineRegisterInfo *MRI = B.getMRI();
3281   LLT Ty = MRI->getType(VData);
3282 
3283   const LLT S16 = LLT::scalar(16);
3284 
3285   // Fixup illegal register types for i8 stores.
3286   if (Ty == LLT::scalar(8) || Ty == S16) {
3287     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3288     return AnyExt;
3289   }
3290 
3291   if (Ty.isVector()) {
3292     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3293       if (IsFormat)
3294         return handleD16VData(B, *MRI, VData);
3295     }
3296   }
3297 
3298   return VData;
3299 }
3300 
3301 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3302                                               MachineRegisterInfo &MRI,
3303                                               MachineIRBuilder &B,
3304                                               bool IsTyped,
3305                                               bool IsFormat) const {
3306   Register VData = MI.getOperand(1).getReg();
3307   LLT Ty = MRI.getType(VData);
3308   LLT EltTy = Ty.getScalarType();
3309   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3310   const LLT S32 = LLT::scalar(32);
3311 
3312   VData = fixStoreSourceType(B, VData, IsFormat);
3313   Register RSrc = MI.getOperand(2).getReg();
3314 
3315   MachineMemOperand *MMO = *MI.memoperands_begin();
3316   const int MemSize = MMO->getSize();
3317 
3318   unsigned ImmOffset;
3319   unsigned TotalOffset;
3320 
3321   // The typed intrinsics add an immediate after the registers.
3322   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3323 
3324   // The struct intrinsic variants add one additional operand over raw.
3325   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3326   Register VIndex;
3327   int OpOffset = 0;
3328   if (HasVIndex) {
3329     VIndex = MI.getOperand(3).getReg();
3330     OpOffset = 1;
3331   }
3332 
3333   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3334   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3335 
3336   unsigned Format = 0;
3337   if (IsTyped) {
3338     Format = MI.getOperand(5 + OpOffset).getImm();
3339     ++OpOffset;
3340   }
3341 
3342   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3343 
3344   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3345   if (TotalOffset != 0)
3346     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3347 
3348   unsigned Opc;
3349   if (IsTyped) {
3350     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3351                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3352   } else if (IsFormat) {
3353     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3354                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3355   } else {
3356     switch (MemSize) {
3357     case 1:
3358       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3359       break;
3360     case 2:
3361       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3362       break;
3363     default:
3364       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3365       break;
3366     }
3367   }
3368 
3369   if (!VIndex)
3370     VIndex = B.buildConstant(S32, 0).getReg(0);
3371 
3372   auto MIB = B.buildInstr(Opc)
3373     .addUse(VData)              // vdata
3374     .addUse(RSrc)               // rsrc
3375     .addUse(VIndex)             // vindex
3376     .addUse(VOffset)            // voffset
3377     .addUse(SOffset)            // soffset
3378     .addImm(ImmOffset);         // offset(imm)
3379 
3380   if (IsTyped)
3381     MIB.addImm(Format);
3382 
3383   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3384      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3385      .addMemOperand(MMO);
3386 
3387   MI.eraseFromParent();
3388   return true;
3389 }
3390 
3391 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3392                                              MachineRegisterInfo &MRI,
3393                                              MachineIRBuilder &B,
3394                                              bool IsFormat,
3395                                              bool IsTyped) const {
3396   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3397   MachineMemOperand *MMO = *MI.memoperands_begin();
3398   const int MemSize = MMO->getSize();
3399   const LLT S32 = LLT::scalar(32);
3400 
3401   Register Dst = MI.getOperand(0).getReg();
3402   Register RSrc = MI.getOperand(2).getReg();
3403 
3404   // The typed intrinsics add an immediate after the registers.
3405   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3406 
3407   // The struct intrinsic variants add one additional operand over raw.
3408   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3409   Register VIndex;
3410   int OpOffset = 0;
3411   if (HasVIndex) {
3412     VIndex = MI.getOperand(3).getReg();
3413     OpOffset = 1;
3414   }
3415 
3416   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3417   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3418 
3419   unsigned Format = 0;
3420   if (IsTyped) {
3421     Format = MI.getOperand(5 + OpOffset).getImm();
3422     ++OpOffset;
3423   }
3424 
3425   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3426   unsigned ImmOffset;
3427   unsigned TotalOffset;
3428 
3429   LLT Ty = MRI.getType(Dst);
3430   LLT EltTy = Ty.getScalarType();
3431   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3432   const bool Unpacked = ST.hasUnpackedD16VMem();
3433 
3434   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3435   if (TotalOffset != 0)
3436     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3437 
3438   unsigned Opc;
3439 
3440   if (IsTyped) {
3441     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3442                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3443   } else if (IsFormat) {
3444     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3445                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3446   } else {
3447     switch (MemSize) {
3448     case 1:
3449       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3450       break;
3451     case 2:
3452       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3453       break;
3454     default:
3455       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3456       break;
3457     }
3458   }
3459 
3460   Register LoadDstReg;
3461 
3462   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3463   LLT UnpackedTy = Ty.changeElementSize(32);
3464 
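  // Sub-dword loads (and scalar d16 loads) are widened to a 32-bit result and
  // truncated back afterwards; unpacked d16 vectors load one dword per element
  // and are repacked below.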
3465   if (IsExtLoad)
3466     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3467   else if (Unpacked && IsD16 && Ty.isVector())
3468     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3469   else
3470     LoadDstReg = Dst;
3471 
3472   if (!VIndex)
3473     VIndex = B.buildConstant(S32, 0).getReg(0);
3474 
3475   auto MIB = B.buildInstr(Opc)
3476     .addDef(LoadDstReg)         // vdata
3477     .addUse(RSrc)               // rsrc
3478     .addUse(VIndex)             // vindex
3479     .addUse(VOffset)            // voffset
3480     .addUse(SOffset)            // soffset
3481     .addImm(ImmOffset);         // offset(imm)
3482 
3483   if (IsTyped)
3484     MIB.addImm(Format);
3485 
3486   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3487      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3488      .addMemOperand(MMO);
3489 
3490   if (LoadDstReg != Dst) {
3491     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3492 
3493     // Truncate the widened result back down for extending loads.
3494     if (IsExtLoad)
3495       B.buildTrunc(Dst, LoadDstReg);
3496     else {
3497       // Repack to original 16-bit vector result
3498       // FIXME: G_TRUNC should work, but legalization currently fails
3499       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3500       SmallVector<Register, 4> Repack;
3501       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3502         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3503       B.buildMerge(Dst, Repack);
3504     }
3505   }
3506 
3507   MI.eraseFromParent();
3508   return true;
3509 }
3510 
3511 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3512                                                MachineIRBuilder &B,
3513                                                bool IsInc) const {
3514   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3515                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3516   B.buildInstr(Opc)
3517     .addDef(MI.getOperand(0).getReg())
3518     .addUse(MI.getOperand(2).getReg())
3519     .addUse(MI.getOperand(3).getReg())
3520     .cloneMemRefs(MI);
3521   MI.eraseFromParent();
3522   return true;
3523 }
3524 
3525 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3526   switch (IntrID) {
3527   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3528   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3529     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3530   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3531   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3532     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3533   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3534   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3535     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3536   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3537   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3538     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3539   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3540   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3541     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3542   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3543   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3544     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3545   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3546   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3547     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3548   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3549   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3550     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3551   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3552   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3553     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3554   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3555   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3556     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3557   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3558   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3559     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3560   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3561   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3562     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3563   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3564   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3565     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3566   default:
3567     llvm_unreachable("unhandled atomic opcode");
3568   }
3569 }
3570 
3571 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3572                                                MachineIRBuilder &B,
3573                                                Intrinsic::ID IID) const {
3574   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3575                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3576 
3577   Register Dst = MI.getOperand(0).getReg();
3578   Register VData = MI.getOperand(2).getReg();
3579 
3580   Register CmpVal;
3581   int OpOffset = 0;
3582 
3583   if (IsCmpSwap) {
3584     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3585     ++OpOffset;
3586   }
3587 
3588   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3589   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3590 
3591   // The struct intrinsic variants add one additional operand over raw.
3592   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3593   Register VIndex;
3594   if (HasVIndex) {
3595     VIndex = MI.getOperand(4 + OpOffset).getReg();
3596     ++OpOffset;
3597   }
3598 
3599   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3600   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3601   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3602 
3603   MachineMemOperand *MMO = *MI.memoperands_begin();
3604 
3605   unsigned ImmOffset;
3606   unsigned TotalOffset;
3607   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3608   if (TotalOffset != 0)
3609     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3610 
3611   if (!VIndex)
3612     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3613 
3614   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3615     .addDef(Dst)
3616     .addUse(VData); // vdata
3617 
3618   if (IsCmpSwap)
3619     MIB.addReg(CmpVal);
3620 
3621   MIB.addUse(RSrc)               // rsrc
3622      .addUse(VIndex)             // vindex
3623      .addUse(VOffset)            // voffset
3624      .addUse(SOffset)            // soffset
3625      .addImm(ImmOffset)          // offset(imm)
3626      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3627      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3628      .addMemOperand(MMO);
3629 
3630   MI.eraseFromParent();
3631   return true;
3632 }
3633 
3634 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3635 /// vector with s16 typed elements.
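/// For example, s16 coordinates (x, y, z) end up as <2 x s16> (x, y) followed
/// by <2 x s16> (z, undef).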
3636 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3637                                         SmallVectorImpl<Register> &PackedAddrs,
3638                                         int AddrIdx, int DimIdx, int EndIdx,
3639                                         int NumGradients) {
3640   const LLT S16 = LLT::scalar(16);
3641   const LLT V2S16 = LLT::vector(2, 16);
3642 
3643   for (int I = AddrIdx; I < EndIdx; ++I) {
3644     MachineOperand &SrcOp = MI.getOperand(I);
3645     if (!SrcOp.isReg())
3646       continue; // _L to _LZ may have eliminated this.
3647 
3648     Register AddrReg = SrcOp.getReg();
3649 
3650     if (I < DimIdx) {
3651       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3652       PackedAddrs.push_back(AddrReg);
3653     } else {
3654       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3655       // derivatives dx/dh and dx/dv are packed with undef.
3656       if (((I + 1) >= EndIdx) ||
3657           ((NumGradients / 2) % 2 == 1 &&
3658            (I == DimIdx + (NumGradients / 2) - 1 ||
3659             I == DimIdx + NumGradients - 1)) ||
3660           // Check for _L to _LZ optimization
3661           !MI.getOperand(I + 1).isReg()) {
3662         PackedAddrs.push_back(
3663             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3664                 .getReg(0));
3665       } else {
3666         PackedAddrs.push_back(
3667             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3668                 .getReg(0));
3669         ++I;
3670       }
3671     }
3672   }
3673 }
3674 
3675 /// Convert from separate vaddr components to a single vector address register,
3676 /// and replace the remaining operands with $noreg.
3677 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3678                                      int DimIdx, int NumVAddrs) {
3679   const LLT S32 = LLT::scalar(32);
3680 
3681   SmallVector<Register, 8> AddrRegs;
3682   for (int I = 0; I != NumVAddrs; ++I) {
3683     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3684     if (SrcOp.isReg()) {
3685       AddrRegs.push_back(SrcOp.getReg());
3686       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3687     }
3688   }
3689 
3690   int NumAddrRegs = AddrRegs.size();
3691   if (NumAddrRegs != 1) {
3692     // Round up to 8 elements for v5-v7
3693     // FIXME: Missing intermediate sized register classes and instructions.
3694     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3695       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3696       auto Undef = B.buildUndef(S32);
3697       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3698       NumAddrRegs = RoundedNumRegs;
3699     }
3700 
3701     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3702     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3703   }
3704 
3705   for (int I = 1; I != NumVAddrs; ++I) {
3706     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3707     if (SrcOp.isReg())
3708       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3709   }
3710 }
3711 
3712 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3713 ///
3714 /// Depending on the subtarget, load/store with 16-bit element data need to be
3715 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3716 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3717 /// registers.
3718 ///
3719 /// We don't want to directly select image instructions just yet, but also want
3720 /// to expose all register repacking to the legalizer/combiners. We also don't
3721 /// want a selected instruction entering RegBankSelect. In order to avoid
3722 /// defining a multitude of intermediate image instructions, directly hack on
3723 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3724 /// padding now-unnecessary arguments with $noreg.
3725 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3726     MachineInstr &MI, MachineIRBuilder &B,
3727     GISelChangeObserver &Observer,
3728     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3729 
3730   const int NumDefs = MI.getNumExplicitDefs();
3731   bool IsTFE = NumDefs == 2;
3732   // We are only processing the operands of d16 image operations on subtargets
3733   // that use the unpacked register layout, or need to repack the TFE result.
3734 
3735   // TODO: Do we need to guard against already legalized intrinsics?
3736   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3737     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3738 
3739   MachineRegisterInfo *MRI = B.getMRI();
3740   const LLT S32 = LLT::scalar(32);
3741   const LLT S16 = LLT::scalar(16);
3742   const LLT V2S16 = LLT::vector(2, 16);
3743 
3744   // Index of first address argument
3745   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3746 
3747   int NumVAddrs, NumGradients;
3748   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3749   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3750     getDMaskIdx(BaseOpcode, NumDefs);
3751   unsigned DMask = 0;
3752 
3753   // Check for 16 bit addresses and pack if true.
3754   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3755   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3756   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3757   const bool IsG16 = GradTy == S16;
3758   const bool IsA16 = AddrTy == S16;
3759 
3760   int DMaskLanes = 0;
3761   if (!BaseOpcode->Atomic) {
3762     DMask = MI.getOperand(DMaskIdx).getImm();
3763     if (BaseOpcode->Gather4) {
3764       DMaskLanes = 4;
3765     } else if (DMask != 0) {
3766       DMaskLanes = countPopulation(DMask);
3767     } else if (!IsTFE && !BaseOpcode->Store) {
3768       // If dmask is 0, this is a no-op load. This can be eliminated.
3769       B.buildUndef(MI.getOperand(0));
3770       MI.eraseFromParent();
3771       return true;
3772     }
3773   }
3774 
3775   Observer.changingInstr(MI);
3776   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3777 
3778   unsigned NewOpcode = NumDefs == 0 ?
3779     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3780 
3781   // Track that we legalized this
3782   MI.setDesc(B.getTII().get(NewOpcode));
3783 
3784   // Expecting to get an error flag since TFE is on and dmask is 0. Force the
3785   // dmask to be at least 1, otherwise the instruction will fail.
3786   if (IsTFE && DMask == 0) {
3787     DMask = 0x1;
3788     DMaskLanes = 1;
3789     MI.getOperand(DMaskIdx).setImm(DMask);
3790   }
3791 
3792   if (BaseOpcode->Atomic) {
3793     Register VData0 = MI.getOperand(2).getReg();
3794     LLT Ty = MRI->getType(VData0);
3795 
3796     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3797     if (Ty.isVector())
3798       return false;
3799 
3800     if (BaseOpcode->AtomicX2) {
3801       Register VData1 = MI.getOperand(3).getReg();
3802       // The two values are packed in one register.
3803       LLT PackedTy = LLT::vector(2, Ty);
3804       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3805       MI.getOperand(2).setReg(Concat.getReg(0));
3806       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3807     }
3808   }
3809 
3810   int CorrectedNumVAddrs = NumVAddrs;
3811 
3812   // Optimize _L to _LZ when _L is zero
3813   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3814         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3815     const ConstantFP *ConstantLod;
3816     const int LodIdx = AddrIdx + NumVAddrs - 1;
3817 
3818     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3819       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3820         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3821         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3822           LZMappingInfo->LZ, ImageDimIntr->Dim);
3823 
3824         // The starting indexes should remain in the same place.
3825         --NumVAddrs;
3826         --CorrectedNumVAddrs;
3827 
3828         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3829           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3830         MI.RemoveOperand(LodIdx);
3831       }
3832     }
3833   }
3834 
3835   // Optimize _mip away, when 'lod' is zero
3836   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3837     int64_t ConstantLod;
3838     const int LodIdx = AddrIdx + NumVAddrs - 1;
3839 
3840     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3841       if (ConstantLod == 0) {
3842         // TODO: Change the intrinsic opcode and remove the operand instead of
3843         // replacing it with 0, as is done for the _L to _LZ handling above.
3844         MI.getOperand(LodIdx).ChangeToImmediate(0);
3845         --CorrectedNumVAddrs;
3846       }
3847     }
3848   }
3849 
3850   // Rewrite the addressing register layout before doing anything else.
3851   if (IsA16 || IsG16) {
3852     if (IsA16) {
3853       // Target must support the feature and gradients need to be 16 bit too
3854       if (!ST.hasA16() || !IsG16)
3855         return false;
3856     } else if (!ST.hasG16())
3857       return false;
3858 
3859     if (NumVAddrs > 1) {
3860       SmallVector<Register, 4> PackedRegs;
3861       // Don't compress addresses for G16
3862       const int PackEndIdx =
3863           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3864       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3865                                   PackEndIdx, NumGradients);
3866 
3867       if (!IsA16) {
3868         // Add uncompressed address
3869         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3870           int AddrReg = MI.getOperand(I).getReg();
3871           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3872           PackedRegs.push_back(AddrReg);
3873         }
3874       }
3875 
3876       // See also below in the non-a16 branch
3877       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3878 
3879       if (!UseNSA && PackedRegs.size() > 1) {
3880         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3881         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3882         PackedRegs[0] = Concat.getReg(0);
3883         PackedRegs.resize(1);
3884       }
3885 
3886       const int NumPacked = PackedRegs.size();
3887       for (int I = 0; I != NumVAddrs; ++I) {
3888         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3889         if (!SrcOp.isReg()) {
3890           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3891           continue;
3892         }
3893 
3894         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3895 
3896         if (I < NumPacked)
3897           SrcOp.setReg(PackedRegs[I]);
3898         else
3899           SrcOp.setReg(AMDGPU::NoRegister);
3900       }
3901     }
3902   } else {
3903     // If the register allocator cannot place the address registers contiguously
3904     // without introducing moves, then using the non-sequential address encoding
3905     // is always preferable, since it saves VALU instructions and is usually a
3906     // wash in terms of code size or even better.
3907     //
3908     // However, we currently have no way of hinting to the register allocator
3909     // that MIMG addresses should be placed contiguously when it is possible to
3910     // do so, so force non-NSA for the common 2-address case as a heuristic.
3911     //
3912     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3913     // allocation when possible.
3914     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3915 
3916     if (!UseNSA && NumVAddrs > 1)
3917       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3918   }
3919 
3920   int Flags = 0;
3921   if (IsA16)
3922     Flags |= 1;
3923   if (IsG16)
3924     Flags |= 2;
3925   MI.addOperand(MachineOperand::CreateImm(Flags));
3926 
3927   if (BaseOpcode->Store) { // No TFE for stores?
3928     // TODO: Handle dmask trim
3929     Register VData = MI.getOperand(1).getReg();
3930     LLT Ty = MRI->getType(VData);
3931     if (!Ty.isVector() || Ty.getElementType() != S16)
3932       return true;
3933 
3934     Register RepackedReg = handleD16VData(B, *MRI, VData);
3935     if (RepackedReg != VData) {
3936       MI.getOperand(1).setReg(RepackedReg);
3937     }
3938 
3939     return true;
3940   }
3941 
3942   Register DstReg = MI.getOperand(0).getReg();
3943   LLT Ty = MRI->getType(DstReg);
3944   const LLT EltTy = Ty.getScalarType();
3945   const bool IsD16 = Ty.getScalarType() == S16;
3946   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3947 
3948   // Confirm that the return type is large enough for the dmask specified
3949   if (NumElts < DMaskLanes)
3950     return false;
3951 
3952   if (NumElts > 4 || DMaskLanes > 4)
3953     return false;
3954 
3955   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3956   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3957 
3958   // The raw dword aligned data component of the load. The only legal cases
3959   // where this matters should be when using the packed D16 format, for
3960   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3961   LLT RoundedTy;
3962 
3963   // S32 vector to cover all data, plus TFE result element.
3964   LLT TFETy;
3965 
3966   // Register type to use for each loaded component. Will be S32 or V2S16.
3967   LLT RegTy;
3968 
3969   if (IsD16 && ST.hasUnpackedD16VMem()) {
3970     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3971     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3972     RegTy = S32;
3973   } else {
3974     unsigned EltSize = EltTy.getSizeInBits();
3975     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3976     unsigned RoundedSize = 32 * RoundedElts;
3977     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3978     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3979     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3980   }
3981 
3982   // The return type does not need adjustment.
3983   // TODO: Should we change s16 case to s32 or <2 x s16>?
3984   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3985     return true;
3986 
3987   Register Dst1Reg;
3988 
3989   // Insert after the instruction.
3990   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3991 
3992   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3993   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3994   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3995   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3996 
3997   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3998 
3999   MI.getOperand(0).setReg(NewResultReg);
4000 
4001   // In the IR, TFE is supposed to be used with a 2 element struct return
4002   // type. The instruction really returns these two values in one contiguous
4003   // register, with one additional dword beyond the loaded data. Rewrite the
4004   // return type to use a single register result.
4005 
4006   if (IsTFE) {
4007     Dst1Reg = MI.getOperand(1).getReg();
4008     if (MRI->getType(Dst1Reg) != S32)
4009       return false;
4010 
4011     // TODO: Make sure the TFE operand bit is set.
4012     MI.RemoveOperand(1);
4013 
4014     // Handle the easy case that requires no repack instructions.
4015     if (Ty == S32) {
4016       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4017       return true;
4018     }
4019   }
4020 
4021   // Now figure out how to copy the new result register back into the old
4022   // result.
4023   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4024 
4025   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4026 
4027   if (ResultNumRegs == 1) {
4028     assert(!IsTFE);
4029     ResultRegs[0] = NewResultReg;
4030   } else {
4031     // We have to repack into a new vector of some kind.
4032     for (int I = 0; I != NumDataRegs; ++I)
4033       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4034     B.buildUnmerge(ResultRegs, NewResultReg);
4035 
4036     // Drop the final TFE element to get the data part. The TFE result is
4037     // directly written to the right place already.
4038     if (IsTFE)
4039       ResultRegs.resize(NumDataRegs);
4040   }
4041 
4042   // For an s16 scalar result, we form an s32 result with a truncate regardless
4043   // of packed vs. unpacked.
4044   if (IsD16 && !Ty.isVector()) {
4045     B.buildTrunc(DstReg, ResultRegs[0]);
4046     return true;
4047   }
4048 
4049   // Avoid a build/concat_vector of 1 entry.
4050   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4051     B.buildBitcast(DstReg, ResultRegs[0]);
4052     return true;
4053   }
4054 
4055   assert(Ty.isVector());
4056 
4057   if (IsD16) {
4058     // For packed D16 results with TFE enabled, all the data components are
4059     // S32. Cast back to the expected type.
4060     //
4061     // TODO: We don't really need to load s32 elements. We would only need one
4062     // cast for the TFE result if a multiple of v2s16 was used.
4063     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4064       for (Register &Reg : ResultRegs)
4065         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4066     } else if (ST.hasUnpackedD16VMem()) {
4067       for (Register &Reg : ResultRegs)
4068         Reg = B.buildTrunc(S16, Reg).getReg(0);
4069     }
4070   }
4071 
4072   auto padWithUndef = [&](LLT Ty, int NumElts) {
4073     if (NumElts == 0)
4074       return;
4075     Register Undef = B.buildUndef(Ty).getReg(0);
4076     for (int I = 0; I != NumElts; ++I)
4077       ResultRegs.push_back(Undef);
4078   };
4079 
4080   // Pad out any elements eliminated due to the dmask.
4081   LLT ResTy = MRI->getType(ResultRegs[0]);
4082   if (!ResTy.isVector()) {
4083     padWithUndef(ResTy, NumElts - ResultRegs.size());
4084     B.buildBuildVector(DstReg, ResultRegs);
4085     return true;
4086   }
4087 
4088   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4089   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4090 
4091   // Deal with the one annoying legal case.
4092   const LLT V3S16 = LLT::vector(3, 16);
4093   if (Ty == V3S16) {
4094     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4095     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4096     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4097     return true;
4098   }
4099 
4100   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4101   B.buildConcatVectors(DstReg, ResultRegs);
4102   return true;
4103 }
4104 
4105 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4106   MachineInstr &MI, MachineIRBuilder &B,
4107   GISelChangeObserver &Observer) const {
4108   Register Dst = MI.getOperand(0).getReg();
4109   LLT Ty = B.getMRI()->getType(Dst);
4110   unsigned Size = Ty.getSizeInBits();
4111   MachineFunction &MF = B.getMF();
4112 
4113   Observer.changingInstr(MI);
4114 
4115   // FIXME: We don't really need this intermediate instruction. The intrinsic
4116   // should be fixed to have a memory operand. Since it's readnone, we're not
4117   // allowed to add one.
4118   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4119   MI.RemoveOperand(1); // Remove intrinsic ID
4120 
4121   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4122   // TODO: Should this use datalayout alignment?
4123   const unsigned MemSize = (Size + 7) / 8;
4124   const Align MemAlign(4);
4125   MachineMemOperand *MMO = MF.getMachineMemOperand(
4126       MachinePointerInfo(),
4127       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4128           MachineMemOperand::MOInvariant,
4129       MemSize, MemAlign);
4130   MI.addMemOperand(MF, MMO);
4131 
4132   // There are no 96-bit result scalar loads, but widening to 128-bit should
4133   // always be legal. We may need to restore this to a 96-bit result if it turns
4134   // out this needs to be converted to a vector load during RegBankSelect.
4135   if (!isPowerOf2_32(Size)) {
4136     LegalizerHelper Helper(MF, *this, Observer, B);
4137 
4138     if (Ty.isVector())
4139       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4140     else
4141       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4142   }
4143 
4144   Observer.changedInstr(MI);
4145   return true;
4146 }
4147 
4148 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4149                                                 MachineRegisterInfo &MRI,
4150                                                 MachineIRBuilder &B) const {
4151   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4152   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4153       !ST.isTrapHandlerEnabled()) {
4154     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4155   } else {
4156     // Pass queue pointer to trap handler as input, and insert trap instruction
4157     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4158     const ArgDescriptor *Arg =
4159         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4160     if (!Arg)
4161       return false;
4162     MachineRegisterInfo &MRI = *B.getMRI();
4163     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4164     Register LiveIn = getLiveInRegister(
4165         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4166         /*InsertLiveInCopy=*/false);
4167     if (!loadInputValue(LiveIn, B, Arg))
4168       return false;
4169     B.buildCopy(SGPR01, LiveIn);
4170     B.buildInstr(AMDGPU::S_TRAP)
4171         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4172         .addReg(SGPR01, RegState::Implicit);
4173   }
4174 
4175   MI.eraseFromParent();
4176   return true;
4177 }
4178 
4179 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4180     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4181   // If this is a non-HSA path or the trap handler is disabled, report a
4182   // warning accordingly.
4183   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4184       !ST.isTrapHandlerEnabled()) {
4185     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4186                                      "debugtrap handler not supported",
4187                                      MI.getDebugLoc(), DS_Warning);
4188     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4189     Ctx.diagnose(NoTrap);
4190   } else {
4191     // Insert debug-trap instruction
4192     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4193   }
4194 
4195   MI.eraseFromParent();
4196   return true;
4197 }
4198 
4199 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4200                                             MachineInstr &MI) const {
4201   MachineIRBuilder &B = Helper.MIRBuilder;
4202   MachineRegisterInfo &MRI = *B.getMRI();
4203 
4204   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4205   auto IntrID = MI.getIntrinsicID();
4206   switch (IntrID) {
4207   case Intrinsic::amdgcn_if:
4208   case Intrinsic::amdgcn_else: {
4209     MachineInstr *Br = nullptr;
4210     MachineBasicBlock *UncondBrTarget = nullptr;
4211     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4212       const SIRegisterInfo *TRI
4213         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4214 
4215       Register Def = MI.getOperand(1).getReg();
4216       Register Use = MI.getOperand(3).getReg();
4217 
4218       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4219       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4220       if (IntrID == Intrinsic::amdgcn_if) {
4221         B.buildInstr(AMDGPU::SI_IF)
4222           .addDef(Def)
4223           .addUse(Use)
4224           .addMBB(UncondBrTarget);
4225       } else {
4226         B.buildInstr(AMDGPU::SI_ELSE)
4227           .addDef(Def)
4228           .addUse(Use)
4229           .addMBB(UncondBrTarget)
4230           .addImm(0);
4231       }
4232 
4233       if (Br) {
4234         Br->getOperand(0).setMBB(CondBrTarget);
4235       } else {
4236         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4237         // since we're swapping branch targets it needs to be reinserted.
4238         // FIXME: IRTranslator should probably not do this
4239         B.buildBr(*CondBrTarget);
4240       }
4241 
4242       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4243       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4244       MI.eraseFromParent();
4245       BrCond->eraseFromParent();
4246       return true;
4247     }
4248 
4249     return false;
4250   }
4251   case Intrinsic::amdgcn_loop: {
4252     MachineInstr *Br = nullptr;
4253     MachineBasicBlock *UncondBrTarget = nullptr;
4254     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4255       const SIRegisterInfo *TRI
4256         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4257 
4258       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4259       Register Reg = MI.getOperand(2).getReg();
4260 
4261       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4262       B.buildInstr(AMDGPU::SI_LOOP)
4263         .addUse(Reg)
4264         .addMBB(UncondBrTarget);
4265 
4266       if (Br)
4267         Br->getOperand(0).setMBB(CondBrTarget);
4268       else
4269         B.buildBr(*CondBrTarget);
4270 
4271       MI.eraseFromParent();
4272       BrCond->eraseFromParent();
4273       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4274       return true;
4275     }
4276 
4277     return false;
4278   }
4279   case Intrinsic::amdgcn_kernarg_segment_ptr:
4280     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4281       // This only makes sense to call in a kernel, so just lower to null.
4282       B.buildConstant(MI.getOperand(0).getReg(), 0);
4283       MI.eraseFromParent();
4284       return true;
4285     }
4286 
4287     return legalizePreloadedArgIntrin(
4288       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4289   case Intrinsic::amdgcn_implicitarg_ptr:
4290     return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
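  // Address space queries on flat pointers: is.shared tests for LDS and
  // is.private tests for scratch.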
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
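  // The wavefront size is a subtarget constant, so fold the intrinsic to an
  // immediate.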
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
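  // Buffer stores and loads share one helper each; the boolean flags select
  // among the plain, *_format, and tbuffer variants (note the flag order
  // differs between the store and load helpers).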
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
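  // All raw/struct buffer atomics funnel into a single helper that dispatches
  // on the intrinsic ID.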
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
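  // The boolean selects between the increment and decrement lowerings.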
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
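  // Image intrinsics are looked up in the ImageDimIntrinsicInfo table;
  // anything else needs no custom legalization here.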
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}