1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
55 
// Round the number of bits to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
62 
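// Match vectors with an odd number of sub-32-bit elements whose total size is
// not a multiple of 32 bits (e.g. <3 x s16>).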
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
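// Match vectors of 16-bit elements with more than two elements (wider than
// v2s16).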
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
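// Mutation that pads the vector type at TypeIdx with one extra element.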
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
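// Mutation that splits a vector wider than 64 bits into roughly 64-bit pieces
// by dividing the element count by the number of 64-bit chunks needed.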
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
100 // Increase the number of vector elements to reach the next multiple of 32-bit
101 // type.
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
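// Mutation that coerces the type at TypeIdx to something representable in the
// register file: a single scalar for sizes of 32 bits or less, otherwise s32
// or a vector of s32.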
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
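// Whole register sizes are multiples of 32 bits, up to MaxRegisterSize.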
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
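// Vectors map onto registers when the element size is 32, 64, 128, or 256
// bits, or when 16-bit elements come in even counts (packed v2s16 halves).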
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
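// Vector element types the selector can handle directly: s16 or anything at
// least 32 bits wide.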
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
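// Match truncating stores of a scalar wider than 32 bits.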
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
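// Check whether the register type, memory size, and alignment are directly
// usable for a load/store in the given address space.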
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
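// A load/store is directly legal if the result is a register type, the size
// and alignment checks pass, and no bitcast workaround is needed.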
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
  : ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
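  // There are no native integer division instructions; these are expanded via
  // custom lowering.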
444   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
445     .customFor({S32, S64})
446     .clampScalar(0, S32, S64)
447     .widenScalarToNextPow2(0, 32)
448     .scalarize(0);
449 
450   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
451     .legalFor({S32})
452     .clampScalar(0, S32, S32)
453     .scalarize(0);
454 
455   // Report legal for any types we can handle anywhere. For the cases only legal
456   // on the SALU, RegBankSelect will be able to re-legalize.
457   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
458     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
459     .clampScalar(0, S32, S64)
460     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
461     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
462     .widenScalarToNextPow2(0)
463     .scalarize(0);
464 
465   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
466                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
467     .legalFor({{S32, S1}, {S32, S32}})
468     .minScalar(0, S32)
469     // TODO: .scalarize(0)
470     .lower();
471 
472   getActionDefinitionsBuilder(G_BITCAST)
473     // Don't worry about the size constraint.
474     .legalIf(all(isRegisterType(0), isRegisterType(1)))
475     .lower();
476 
478   getActionDefinitionsBuilder(G_CONSTANT)
479     .legalFor({S1, S32, S64, S16, GlobalPtr,
480                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
481     .clampScalar(0, S32, S64)
482     .widenScalarToNextPow2(0)
483     .legalIf(isPointer(0));
484 
485   getActionDefinitionsBuilder(G_FCONSTANT)
486     .legalFor({S32, S64, S16})
487     .clampScalar(0, S16, S64);
488 
489   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
490       .legalIf(isRegisterType(0))
491       // s1 and s16 are special cases because they have legal operations on
492       // them, but don't really occupy registers in the normal way.
493       .legalFor({S1, S16})
494       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
495       .clampScalarOrElt(0, S32, MaxScalar)
496       .widenScalarToNextPow2(0, 32)
497       .clampMaxNumElements(0, S32, 16);
498 
499   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
500 
501   // If the amount is divergent, we have to do a wave reduction to get the
502   // maximum value, so this is expanded during RegBankSelect.
503   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
504     .legalFor({{PrivatePtr, S32}});
505 
506   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
507     .unsupportedFor({PrivatePtr})
508     .custom();
509   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
510 
511   auto &FPOpActions = getActionDefinitionsBuilder(
512     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
513     .legalFor({S32, S64});
514   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
515     .customFor({S32, S64});
516   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
517     .customFor({S32, S64});
518 
519   if (ST.has16BitInsts()) {
520     if (ST.hasVOP3PInsts())
521       FPOpActions.legalFor({S16, V2S16});
522     else
523       FPOpActions.legalFor({S16});
524 
525     TrigActions.customFor({S16});
526     FDIVActions.customFor({S16});
527   }
528 
529   auto &MinNumMaxNum = getActionDefinitionsBuilder({
530       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
531 
532   if (ST.hasVOP3PInsts()) {
533     MinNumMaxNum.customFor(FPTypesPK16)
534       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
535       .clampMaxNumElements(0, S16, 2)
536       .clampScalar(0, S16, S64)
537       .scalarize(0);
538   } else if (ST.has16BitInsts()) {
539     MinNumMaxNum.customFor(FPTypes16)
540       .clampScalar(0, S16, S64)
541       .scalarize(0);
542   } else {
543     MinNumMaxNum.customFor(FPTypesBase)
544       .clampScalar(0, S32, S64)
545       .scalarize(0);
546   }
547 
548   if (ST.hasVOP3PInsts())
549     FPOpActions.clampMaxNumElements(0, S16, 2);
550 
551   FPOpActions
552     .scalarize(0)
553     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
554 
555   TrigActions
556     .scalarize(0)
557     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
558 
559   FDIVActions
560     .scalarize(0)
561     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
562 
563   getActionDefinitionsBuilder({G_FNEG, G_FABS})
564     .legalFor(FPTypesPK16)
565     .clampMaxNumElements(0, S16, 2)
566     .scalarize(0)
567     .clampScalar(0, S16, S64);
568 
569   if (ST.has16BitInsts()) {
570     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
571       .legalFor({S32, S64, S16})
572       .scalarize(0)
573       .clampScalar(0, S16, S64);
574   } else {
575     getActionDefinitionsBuilder(G_FSQRT)
576       .legalFor({S32, S64})
577       .scalarize(0)
578       .clampScalar(0, S32, S64);
579 
580     if (ST.hasFractBug()) {
581       getActionDefinitionsBuilder(G_FFLOOR)
582         .customFor({S64})
583         .legalFor({S32, S64})
584         .scalarize(0)
585         .clampScalar(0, S32, S64);
586     } else {
587       getActionDefinitionsBuilder(G_FFLOOR)
588         .legalFor({S32, S64})
589         .scalarize(0)
590         .clampScalar(0, S32, S64);
591     }
592   }
593 
594   getActionDefinitionsBuilder(G_FPTRUNC)
595     .legalFor({{S32, S64}, {S16, S32}})
596     .scalarize(0)
597     .lower();
598 
599   getActionDefinitionsBuilder(G_FPEXT)
600     .legalFor({{S64, S32}, {S32, S16}})
601     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
602     .scalarize(0);
603 
604   getActionDefinitionsBuilder(G_FSUB)
605       // Use actual fsub instruction
606       .legalFor({S32})
607       // Must use fadd + fneg
608       .lowerFor({S64, S16, V2S16})
609       .scalarize(0)
610       .clampScalar(0, S32, S64);
611 
612   // Whether this is legal depends on the floating point mode for the function.
613   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
614   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
615     FMad.customFor({S32, S16});
616   else if (ST.hasMadMacF32Insts())
617     FMad.customFor({S32});
618   else if (ST.hasMadF16())
619     FMad.customFor({S16});
620   FMad.scalarize(0)
621       .lower();
622 
623   // TODO: Do we need to clamp maximum bitwidth?
624   getActionDefinitionsBuilder(G_TRUNC)
625     .legalIf(isScalar(0))
626     .legalFor({{V2S16, V2S32}})
627     .clampMaxNumElements(0, S16, 2)
628     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
629     // situations (like an invalid implicit use), we don't want to infinite loop
630     // in the legalizer.
631     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
632     .alwaysLegal();
633 
634   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
635     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
636                {S32, S1}, {S64, S1}, {S16, S1}})
637     .scalarize(0)
638     .clampScalar(0, S32, S64)
639     .widenScalarToNextPow2(1, 32);
640 
641   // TODO: Split s1->s64 during regbankselect for VALU.
642   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
643     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
644     .lowerFor({{S32, S64}})
645     .lowerIf(typeIs(1, S1))
646     .customFor({{S64, S64}});
647   if (ST.has16BitInsts())
648     IToFP.legalFor({{S16, S16}});
649   IToFP.clampScalar(1, S32, S64)
650        .minScalar(0, S32)
651        .scalarize(0)
652        .widenScalarToNextPow2(1);
653 
654   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656     .customFor({{S64, S64}})
657     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
658   if (ST.has16BitInsts())
659     FPToI.legalFor({{S16, S16}});
660   else
661     FPToI.minScalar(1, S32);
662 
663   FPToI.minScalar(0, S32)
664        .scalarize(0)
665        .lower();
666 
667   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
668     .scalarize(0)
669     .lower();
670 
671   if (ST.has16BitInsts()) {
672     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
673       .legalFor({S16, S32, S64})
674       .clampScalar(0, S16, S64)
675       .scalarize(0);
676   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
677     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
678       .legalFor({S32, S64})
679       .clampScalar(0, S32, S64)
680       .scalarize(0);
681   } else {
682     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
683       .legalFor({S32})
684       .customFor({S64})
685       .clampScalar(0, S32, S64)
686       .scalarize(0);
687   }
688 
689   // FIXME: Clamp offset operand.
690   getActionDefinitionsBuilder(G_PTR_ADD)
691     .legalIf(isPointer(0))
692     .scalarize(0);
693 
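  // The mask operand must be s32 or s64, clamped to match the pointer size.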
694   getActionDefinitionsBuilder(G_PTRMASK)
695     .legalIf(typeInSet(1, {S64, S32}))
696     .minScalar(1, S32)
697     .maxScalarIf(sizeIs(0, 32), 1, S32)
698     .maxScalarIf(sizeIs(0, 64), 1, S64)
699     .scalarize(0);
700 
701   auto &CmpBuilder =
702     getActionDefinitionsBuilder(G_ICMP)
703     // The compare output type differs based on the register bank of the output,
704     // so make both s1 and s32 legal.
705     //
706     // Scalar compares producing output in scc will be promoted to s32, as that
707     // is the allocatable register type that will be needed for the copy from
708     // scc. This will be promoted during RegBankSelect, and we assume something
709     // before that won't try to use s32 result types.
710     //
711     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
712     // bank.
713     .legalForCartesianProduct(
714       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
715     .legalForCartesianProduct(
716       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
717   if (ST.has16BitInsts()) {
718     CmpBuilder.legalFor({{S1, S16}});
719   }
720 
721   CmpBuilder
722     .widenScalarToNextPow2(1)
723     .clampScalar(1, S32, S64)
724     .scalarize(0)
725     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
726 
727   getActionDefinitionsBuilder(G_FCMP)
728     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
729     .widenScalarToNextPow2(1)
730     .clampScalar(1, S32, S64)
731     .scalarize(0);
732 
733   // FIXME: fpow has a selection pattern that should move to custom lowering.
734   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
735   if (ST.has16BitInsts())
736     Exp2Ops.legalFor({S32, S16});
737   else
738     Exp2Ops.legalFor({S32});
739   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
740   Exp2Ops.scalarize(0);
741 
742   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
743   if (ST.has16BitInsts())
744     ExpOps.customFor({{S32}, {S16}});
745   else
746     ExpOps.customFor({S32});
747   ExpOps.clampScalar(0, MinScalarFPTy, S32)
748         .scalarize(0);
749 
750   getActionDefinitionsBuilder(G_FPOWI)
751     .clampScalar(0, MinScalarFPTy, S32)
752     .lower();
753 
754   // The 64-bit versions produce 32-bit results, but only on the SALU.
755   getActionDefinitionsBuilder(G_CTPOP)
756     .legalFor({{S32, S32}, {S32, S64}})
757     .clampScalar(0, S32, S32)
758     .clampScalar(1, S32, S64)
759     .scalarize(0)
760     .widenScalarToNextPow2(0, 32)
761     .widenScalarToNextPow2(1, 32);
762 
763   // The hardware instructions return a different result on 0 than the generic
764   // instructions expect. The hardware produces -1, but these produce the
765   // bitwidth.
766   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
767     .scalarize(0)
768     .clampScalar(0, S32, S32)
769     .clampScalar(1, S32, S64)
770     .widenScalarToNextPow2(0, 32)
771     .widenScalarToNextPow2(1, 32)
772     .lower();
773 
774   // The 64-bit versions produce 32-bit results, but only on the SALU.
775   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
776     .legalFor({{S32, S32}, {S32, S64}})
777     .clampScalar(0, S32, S32)
778     .clampScalar(1, S32, S64)
779     .scalarize(0)
780     .widenScalarToNextPow2(0, 32)
781     .widenScalarToNextPow2(1, 32);
782 
783   getActionDefinitionsBuilder(G_BITREVERSE)
784     .legalFor({S32})
785     .clampScalar(0, S32, S32)
786     .scalarize(0);
787 
788   if (ST.has16BitInsts()) {
789     getActionDefinitionsBuilder(G_BSWAP)
790       .legalFor({S16, S32, V2S16})
791       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
794       .widenScalarToNextPow2(0)
795       .clampScalar(0, S16, S32)
796       .scalarize(0);
797 
798     if (ST.hasVOP3PInsts()) {
799       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
800         .legalFor({S32, S16, V2S16})
801         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
802         .clampMaxNumElements(0, S16, 2)
803         .minScalar(0, S16)
804         .widenScalarToNextPow2(0)
805         .scalarize(0)
806         .lower();
807     } else {
808       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
809         .legalFor({S32, S16})
810         .widenScalarToNextPow2(0)
811         .minScalar(0, S16)
812         .scalarize(0)
813         .lower();
814     }
815   } else {
816     // TODO: Should have same legality without v_perm_b32
817     getActionDefinitionsBuilder(G_BSWAP)
818       .legalFor({S32})
819       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
822       .widenScalarToNextPow2(0)
823       .maxScalar(0, S32)
824       .scalarize(0)
825       .lower();
826 
827     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
828       .legalFor({S32})
829       .minScalar(0, S32)
830       .widenScalarToNextPow2(0)
831       .scalarize(0)
832       .lower();
833   }
834 
835   getActionDefinitionsBuilder(G_INTTOPTR)
836     // List the common cases
837     .legalForCartesianProduct(AddrSpaces64, {S64})
838     .legalForCartesianProduct(AddrSpaces32, {S32})
839     .scalarize(0)
840     // Accept any address space as long as the size matches
841     .legalIf(sameSize(0, 1))
842     .widenScalarIf(smallerThan(1, 0),
843       [](const LegalityQuery &Query) {
844         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
845       })
846     .narrowScalarIf(largerThan(1, 0),
847       [](const LegalityQuery &Query) {
848         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
849       });
850 
851   getActionDefinitionsBuilder(G_PTRTOINT)
852     // List the common cases
853     .legalForCartesianProduct(AddrSpaces64, {S64})
854     .legalForCartesianProduct(AddrSpaces32, {S32})
855     .scalarize(0)
856     // Accept any address space as long as the size matches
857     .legalIf(sameSize(0, 1))
858     .widenScalarIf(smallerThan(0, 1),
859       [](const LegalityQuery &Query) {
860         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
861       })
862     .narrowScalarIf(
863       largerThan(0, 1),
864       [](const LegalityQuery &Query) {
865         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
866       });
867 
868   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
869     .scalarize(0)
870     .custom();
871 
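  // Whether a memory access has to be split: wider than the address space
  // allows, an unsupported dword count, or insufficiently aligned.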
872   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
873                                     bool IsLoad) -> bool {
874     const LLT DstTy = Query.Types[0];
875 
876     // Split vector extloads.
877     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
878     unsigned Align = Query.MMODescrs[0].AlignInBits;
879 
880     if (MemSize < DstTy.getSizeInBits())
881       MemSize = std::max(MemSize, Align);
882 
883     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
884       return true;
885 
886     const LLT PtrTy = Query.Types[1];
887     unsigned AS = PtrTy.getAddressSpace();
888     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
889       return true;
890 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
892     // TODO: May be able to widen depending on alignment etc.
893     unsigned NumRegs = (MemSize + 31) / 32;
894     if (NumRegs == 3) {
895       if (!ST.hasDwordx3LoadStores())
896         return true;
897     } else {
898       // If the alignment allows, these should have been widened.
899       if (!isPowerOf2_32(NumRegs))
900         return true;
901     }
902 
903     if (Align < MemSize) {
904       const SITargetLowering *TLI = ST.getTargetLowering();
905       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
906     }
907 
908     return false;
909   };
910 
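  // Whether a load with a non-power-of-2 size should be widened to the next
  // power of 2, based on the alignment and the address space's size limit.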
911   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
912                                          unsigned Opc) -> bool {
913     unsigned Size = Query.Types[0].getSizeInBits();
914     if (isPowerOf2_32(Size))
915       return false;
916 
917     if (Size == 96 && ST.hasDwordx3LoadStores())
918       return false;
919 
920     unsigned AddrSpace = Query.Types[1].getAddressSpace();
921     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
922       return false;
923 
924     unsigned Align = Query.MMODescrs[0].AlignInBits;
925     unsigned RoundedSize = NextPowerOf2(Size);
926     return (Align >= RoundedSize);
927   };
928 
929   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
930   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
931   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
932 
933   // TODO: Refine based on subtargets which support unaligned access or 128-bit
934   // LDS
935   // TODO: Unsupported flat for SI.
936 
937   for (unsigned Op : {G_LOAD, G_STORE}) {
938     const bool IsStore = Op == G_STORE;
939 
940     auto &Actions = getActionDefinitionsBuilder(Op);
941     // Explicitly list some common cases.
942     // TODO: Does this help compile time at all?
943     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
944                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
945                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
946                                       {S64, GlobalPtr, 64, GlobalAlign32},
947                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
948                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
949                                       {S32, GlobalPtr, 8, GlobalAlign8},
950                                       {S32, GlobalPtr, 16, GlobalAlign16},
951 
952                                       {S32, LocalPtr, 32, 32},
953                                       {S64, LocalPtr, 64, 32},
954                                       {V2S32, LocalPtr, 64, 32},
955                                       {S32, LocalPtr, 8, 8},
956                                       {S32, LocalPtr, 16, 16},
957                                       {V2S16, LocalPtr, 32, 32},
958 
959                                       {S32, PrivatePtr, 32, 32},
960                                       {S32, PrivatePtr, 8, 8},
961                                       {S32, PrivatePtr, 16, 16},
962                                       {V2S16, PrivatePtr, 32, 32},
963 
964                                       {S32, ConstantPtr, 32, GlobalAlign32},
965                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
966                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
967                                       {S64, ConstantPtr, 64, GlobalAlign32},
968                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
969     Actions.legalIf(
970       [=](const LegalityQuery &Query) -> bool {
971         return isLoadStoreLegal(ST, Query, Op);
972       });
973 
974     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
975     // 64-bits.
976     //
977     // TODO: Should generalize bitcast action into coerce, which will also cover
978     // inserting addrspacecasts.
979     Actions.customIf(typeIs(1, Constant32Ptr));
980 
981     // Turn any illegal element vectors into something easier to deal
982     // with. These will ultimately produce 32-bit scalar shifts to extract the
983     // parts anyway.
984     //
985     // For odd 16-bit element vectors, prefer to split those into pieces with
986     // 16-bit vector parts.
987     Actions.bitcastIf(
988       [=](const LegalityQuery &Query) -> bool {
989         const LLT Ty = Query.Types[0];
990         const unsigned Size = Ty.getSizeInBits();
991 
992         if (Size != Query.MMODescrs[0].SizeInBits)
993           return Size <= 32 && Ty.isVector();
994 
995         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
996           return true;
997         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
998                !isRegisterVectorElementType(Ty.getElementType());
999       }, bitcastToRegisterType(0));
1000 
1001     Actions
1002         .customIf(typeIs(1, Constant32Ptr))
1003         // Widen suitably aligned loads by loading extra elements.
1004         .moreElementsIf([=](const LegalityQuery &Query) {
1005             const LLT Ty = Query.Types[0];
1006             return Op == G_LOAD && Ty.isVector() &&
1007                    shouldWidenLoadResult(Query, Op);
1008           }, moreElementsToNextPow2(0))
1009         .widenScalarIf([=](const LegalityQuery &Query) {
1010             const LLT Ty = Query.Types[0];
1011             return Op == G_LOAD && !Ty.isVector() &&
1012                    shouldWidenLoadResult(Query, Op);
1013           }, widenScalarOrEltToNextPow2(0))
1014         .narrowScalarIf(
1015             [=](const LegalityQuery &Query) -> bool {
1016               return !Query.Types[0].isVector() &&
1017                      needToSplitMemOp(Query, Op == G_LOAD);
1018             },
1019             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1020               const LLT DstTy = Query.Types[0];
1021               const LLT PtrTy = Query.Types[1];
1022 
1023               const unsigned DstSize = DstTy.getSizeInBits();
1024               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1025 
1026               // Split extloads.
1027               if (DstSize > MemSize)
1028                 return std::make_pair(0, LLT::scalar(MemSize));
1029 
1030               if (!isPowerOf2_32(DstSize)) {
1031                 // We're probably decomposing an odd sized store. Try to split
1032                 // to the widest type. TODO: Account for alignment. As-is it
1033                 // should be OK, since the new parts will be further legalized.
1034                 unsigned FloorSize = PowerOf2Floor(DstSize);
1035                 return std::make_pair(0, LLT::scalar(FloorSize));
1036               }
1037 
1038               if (DstSize > 32 && (DstSize % 32 != 0)) {
1039                 // FIXME: Need a way to specify non-extload of larger size if
1040                 // suitably aligned.
1041                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1042               }
1043 
1044               unsigned MaxSize = maxSizeForAddrSpace(ST,
1045                                                      PtrTy.getAddressSpace(),
1046                                                      Op == G_LOAD);
1047               if (MemSize > MaxSize)
1048                 return std::make_pair(0, LLT::scalar(MaxSize));
1049 
1050               unsigned Align = Query.MMODescrs[0].AlignInBits;
1051               return std::make_pair(0, LLT::scalar(Align));
1052             })
1053         .fewerElementsIf(
1054             [=](const LegalityQuery &Query) -> bool {
1055               return Query.Types[0].isVector() &&
1056                      needToSplitMemOp(Query, Op == G_LOAD);
1057             },
1058             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1059               const LLT DstTy = Query.Types[0];
1060               const LLT PtrTy = Query.Types[1];
1061 
1062               LLT EltTy = DstTy.getElementType();
1063               unsigned MaxSize = maxSizeForAddrSpace(ST,
1064                                                      PtrTy.getAddressSpace(),
1065                                                      Op == G_LOAD);
1066 
1067               // FIXME: Handle widened to power of 2 results better. This ends
1068               // up scalarizing.
1069               // FIXME: 3 element stores scalarized on SI
1070 
1071               // Split if it's too large for the address space.
1072               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1073                 unsigned NumElts = DstTy.getNumElements();
1074                 unsigned EltSize = EltTy.getSizeInBits();
1075 
1076                 if (MaxSize % EltSize == 0) {
1077                   return std::make_pair(
1078                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1079                 }
1080 
1081                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1082 
1083                 // FIXME: Refine when odd breakdowns handled
1084                 // The scalars will need to be re-legalized.
1085                 if (NumPieces == 1 || NumPieces >= NumElts ||
1086                     NumElts % NumPieces != 0)
1087                   return std::make_pair(0, EltTy);
1088 
1089                 return std::make_pair(0,
1090                                       LLT::vector(NumElts / NumPieces, EltTy));
1091               }
1092 
1093               // FIXME: We could probably handle weird extending loads better.
1094               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1095               if (DstTy.getSizeInBits() > MemSize)
1096                 return std::make_pair(0, EltTy);
1097 
1098               unsigned EltSize = EltTy.getSizeInBits();
1099               unsigned DstSize = DstTy.getSizeInBits();
1100               if (!isPowerOf2_32(DstSize)) {
1101                 // We're probably decomposing an odd sized store. Try to split
1102                 // to the widest type. TODO: Account for alignment. As-is it
1103                 // should be OK, since the new parts will be further legalized.
1104                 unsigned FloorSize = PowerOf2Floor(DstSize);
1105                 return std::make_pair(
1106                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1107               }
1108 
1109               // Need to split because of alignment.
1110               unsigned Align = Query.MMODescrs[0].AlignInBits;
1111               if (EltSize > Align &&
1112                   (EltSize / Align < DstTy.getNumElements())) {
1113                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1114               }
1115 
1116               // May need relegalization for the scalars.
1117               return std::make_pair(0, EltTy);
1118             })
1119         .minScalar(0, S32);
1120 
1121     if (IsStore)
1122       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1123 
1124     // TODO: Need a bitcast lower option?
1125     Actions
1126         .widenScalarToNextPow2(0)
1127         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1128   }
1129 
1130   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1131                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1132                                                   {S32, GlobalPtr, 16, 2 * 8},
1133                                                   {S32, LocalPtr, 8, 8},
1134                                                   {S32, LocalPtr, 16, 16},
1135                                                   {S32, PrivatePtr, 8, 8},
1136                                                   {S32, PrivatePtr, 16, 16},
1137                                                   {S32, ConstantPtr, 8, 8},
1138                                                   {S32, ConstantPtr, 16, 2 * 8}});
1139   if (ST.hasFlatAddressSpace()) {
1140     ExtLoads.legalForTypesWithMemDesc(
1141         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1142   }
1143 
1144   ExtLoads.clampScalar(0, S32, S32)
1145           .widenScalarToNextPow2(0)
1146           .unsupportedIfMemSizeNotPow2()
1147           .lower();
1148 
1149   auto &Atomics = getActionDefinitionsBuilder(
1150     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1151      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1152      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1153      G_ATOMICRMW_UMIN})
1154     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1155                {S64, GlobalPtr}, {S64, LocalPtr}});
1156   if (ST.hasFlatAddressSpace()) {
1157     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1158   }
1159 
1160   if (ST.hasLDSFPAtomics()) {
1161     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1162       .legalFor({{S32, LocalPtr}});
1163   }
1164 
1165   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1166   // demarshalling
1167   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1168     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1169                 {S32, FlatPtr}, {S64, FlatPtr}})
1170     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1171                {S32, RegionPtr}, {S64, RegionPtr}});
1172   // TODO: Pointer types, any 32-bit or 64-bit vector
1173 
1174   // Condition should be s32 for scalar, s1 for vector.
1175   getActionDefinitionsBuilder(G_SELECT)
1176     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1177           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1178           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1179     .clampScalar(0, S16, S64)
1180     .scalarize(1)
1181     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1182     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1183     .clampMaxNumElements(0, S32, 2)
1184     .clampMaxNumElements(0, LocalPtr, 2)
1185     .clampMaxNumElements(0, PrivatePtr, 2)
1186     .scalarize(0)
1187     .widenScalarToNextPow2(0)
1188     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1189 
1190   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1191   // be more flexible with the shift amount type.
1192   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1193     .legalFor({{S32, S32}, {S64, S32}});
1194   if (ST.has16BitInsts()) {
1195     if (ST.hasVOP3PInsts()) {
1196       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1197             .clampMaxNumElements(0, S16, 2);
1198     } else
1199       Shifts.legalFor({{S16, S16}});
1200 
1201     // TODO: Support 16-bit shift amounts for all types
1202     Shifts.widenScalarIf(
1203       [=](const LegalityQuery &Query) {
1204         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1205         // 32-bit amount.
1206         const LLT ValTy = Query.Types[0];
1207         const LLT AmountTy = Query.Types[1];
1208         return ValTy.getSizeInBits() <= 16 &&
1209                AmountTy.getSizeInBits() < 16;
1210       }, changeTo(1, S16));
1211     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1212     Shifts.clampScalar(1, S32, S32);
1213     Shifts.clampScalar(0, S16, S64);
1214     Shifts.widenScalarToNextPow2(0, 16);
1215   } else {
1216     // Make sure we legalize the shift amount type first, as the general
1217     // expansion for the shifted type will produce much worse code if it hasn't
1218     // been truncated already.
1219     Shifts.clampScalar(1, S32, S32);
1220     Shifts.clampScalar(0, S32, S64);
1221     Shifts.widenScalarToNextPow2(0, 32);
1222   }
1223   Shifts.scalarize(0);
1224 
1225   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1226     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1227     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1228     unsigned IdxTypeIdx = 2;
1229 
1230     getActionDefinitionsBuilder(Op)
1231       .customIf([=](const LegalityQuery &Query) {
1232           const LLT EltTy = Query.Types[EltTypeIdx];
1233           const LLT VecTy = Query.Types[VecTypeIdx];
1234           const LLT IdxTy = Query.Types[IdxTypeIdx];
1235           return (EltTy.getSizeInBits() == 16 ||
1236                   EltTy.getSizeInBits() % 32 == 0) &&
1237                  VecTy.getSizeInBits() % 32 == 0 &&
1238                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1239                  IdxTy.getSizeInBits() == 32;
1240         })
1241       .clampScalar(EltTypeIdx, S32, S64)
1242       .clampScalar(VecTypeIdx, S32, S64)
1243       .clampScalar(IdxTypeIdx, S32, S32);
1244   }
1245 
1246   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1247     .unsupportedIf([=](const LegalityQuery &Query) {
1248         const LLT &EltTy = Query.Types[1].getElementType();
1249         return Query.Types[0] != EltTy;
1250       });
1251 
1252   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1253     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1254     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1255 
1256     // FIXME: Doesn't handle extract of illegal sizes.
1257     getActionDefinitionsBuilder(Op)
1258       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1259       // FIXME: Multiples of 16 should not be legal.
1260       .legalIf([=](const LegalityQuery &Query) {
1261           const LLT BigTy = Query.Types[BigTyIdx];
1262           const LLT LitTy = Query.Types[LitTyIdx];
1263           return (BigTy.getSizeInBits() % 32 == 0) &&
1264                  (LitTy.getSizeInBits() % 16 == 0);
1265         })
1266       .widenScalarIf(
1267         [=](const LegalityQuery &Query) {
1268           const LLT BigTy = Query.Types[BigTyIdx];
1269           return (BigTy.getScalarSizeInBits() < 16);
1270         },
1271         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1272       .widenScalarIf(
1273         [=](const LegalityQuery &Query) {
1274           const LLT LitTy = Query.Types[LitTyIdx];
1275           return (LitTy.getScalarSizeInBits() < 16);
1276         },
1277         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1278       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1279       .widenScalarToNextPow2(BigTyIdx, 32);
1280 
1281   }
1282 
1283   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1284     .legalForCartesianProduct(AllS32Vectors, {S32})
1285     .legalForCartesianProduct(AllS64Vectors, {S64})
1286     .clampNumElements(0, V16S32, V32S32)
1287     .clampNumElements(0, V2S64, V16S64)
1288     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1289 
1290   if (ST.hasScalarPackInsts()) {
1291     BuildVector
1292       // FIXME: Should probably widen s1 vectors straight to s32
1293       .minScalarOrElt(0, S16)
1294       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1295       .minScalar(1, S32);
1296 
1297     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1298       .legalFor({V2S16, S32})
1299       .lower();
1300     BuildVector.minScalarOrElt(0, S32);
1301   } else {
1302     BuildVector.customFor({V2S16, S16});
1303     BuildVector.minScalarOrElt(0, S32);
1304 
1305     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1306       .customFor({V2S16, S32})
1307       .lower();
1308   }
1309 
1310   BuildVector.legalIf(isRegisterType(0));
1311 
1312   // FIXME: Clamp maximum size
1313   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1314     .legalIf(isRegisterType(0));
1315 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1318   if (ST.hasVOP3PInsts()) {
1319     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1320       .customFor({V2S16, V2S16})
1321       .lower();
1322   } else
1323     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1324 
1325   // Merge/Unmerge
1326   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1327     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1328     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1329 
1330     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1331       const LLT Ty = Query.Types[TypeIdx];
1332       if (Ty.isVector()) {
1333         const LLT &EltTy = Ty.getElementType();
1334         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1335           return true;
1336         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1337           return true;
1338       }
1339       return false;
1340     };
1341 
1342     auto &Builder = getActionDefinitionsBuilder(Op)
1343       .lowerFor({{S16, V2S16}})
1344       .lowerIf([=](const LegalityQuery &Query) {
1345           const LLT BigTy = Query.Types[BigTyIdx];
1346           return BigTy.getSizeInBits() == 32;
1347         })
1348       // Try to widen to s16 first for small types.
1349       // TODO: Only do this on targets with legal s16 shifts
1350       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1351       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1352       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1353       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1354                            elementTypeIs(1, S16)),
1355                        changeTo(1, V2S16))
1356       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1357       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1358       // valid.
1359       .clampScalar(LitTyIdx, S32, S512)
1360       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1361       // Break up vectors with weird elements into scalars
1362       .fewerElementsIf(
1363         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1364         scalarize(0))
1365       .fewerElementsIf(
1366         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1367         scalarize(1))
1368       .clampScalar(BigTyIdx, S32, MaxScalar);
1369 
1370     if (Op == G_MERGE_VALUES) {
1371       Builder.widenScalarIf(
1372         // TODO: Use 16-bit shifts if legal for 8-bit values?
1373         [=](const LegalityQuery &Query) {
1374           const LLT Ty = Query.Types[LitTyIdx];
1375           return Ty.getSizeInBits() < 32;
1376         },
1377         changeTo(LitTyIdx, S32));
1378     }
1379 
1380     Builder.widenScalarIf(
1381       [=](const LegalityQuery &Query) {
1382         const LLT Ty = Query.Types[BigTyIdx];
1383         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1384           Ty.getSizeInBits() % 16 != 0;
1385       },
1386       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1389         const LLT &Ty = Query.Types[BigTyIdx];
1390         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1391         if (NewSizeInBits >= 256) {
1392           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1393           if (RoundedTo < NewSizeInBits)
1394             NewSizeInBits = RoundedTo;
1395         }
1396         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1397       })
1398       .legalIf([=](const LegalityQuery &Query) {
1399           const LLT &BigTy = Query.Types[BigTyIdx];
1400           const LLT &LitTy = Query.Types[LitTyIdx];
1401 
1402           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1403             return false;
1404           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1405             return false;
1406 
1407           return BigTy.getSizeInBits() % 16 == 0 &&
1408                  LitTy.getSizeInBits() % 16 == 0 &&
1409                  BigTy.getSizeInBits() <= MaxRegisterSize;
1410         })
1411       // Any vectors left are the wrong size. Scalarize them.
1412       .scalarize(0)
1413       .scalarize(1);
1414   }
1415 
1416   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1417   // RegBankSelect.
1418   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1419     .legalFor({{S32}, {S64}});
1420 
1421   if (ST.hasVOP3PInsts()) {
1422     SextInReg.lowerFor({{V2S16}})
1423       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1424       // get more vector shift opportunities, since we'll get those when
1425       // expanded.
1426       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1427   } else if (ST.has16BitInsts()) {
1428     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1429   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1432     SextInReg.lowerFor({{S32}, {S64}});
1433   }
1434 
1435   // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
1436   // available, and is selectively legal for s16, s32, v2s16.
1437   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
1438     .scalarize(0)
1439     .clampScalar(0, S16, S32);
1440 
1441   SextInReg
1442     .scalarize(0)
1443     .clampScalar(0, S32, S64)
1444     .lower();
1445 
1446   getActionDefinitionsBuilder(G_FSHR)
1447     .legalFor({{S32, S32}})
1448     .scalarize(0)
1449     .lower();
1450 
1451   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1452     .legalFor({S64});
1453 
1454   getActionDefinitionsBuilder({
1455       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1456       G_FCOPYSIGN,
1457 
1458       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1459       G_READ_REGISTER,
1460       G_WRITE_REGISTER,
1461 
1462       G_SADDO, G_SSUBO,
1463 
      // TODO: Implement
1465       G_FMINIMUM, G_FMAXIMUM,
1466       G_FSHL
1467     }).lower();
1468 
1469   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1470         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1471         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1472     .unsupported();
1473 
1474   computeTables();
1475   verify(*ST.getInstrInfo());
1476 }
1477 
1478 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1479                                          MachineInstr &MI) const {
1480   MachineIRBuilder &B = Helper.MIRBuilder;
1481   MachineRegisterInfo &MRI = *B.getMRI();
1482   GISelChangeObserver &Observer = Helper.Observer;
1483 
1484   switch (MI.getOpcode()) {
1485   case TargetOpcode::G_ADDRSPACE_CAST:
1486     return legalizeAddrSpaceCast(MI, MRI, B);
1487   case TargetOpcode::G_FRINT:
1488     return legalizeFrint(MI, MRI, B);
1489   case TargetOpcode::G_FCEIL:
1490     return legalizeFceil(MI, MRI, B);
1491   case TargetOpcode::G_INTRINSIC_TRUNC:
1492     return legalizeIntrinsicTrunc(MI, MRI, B);
1493   case TargetOpcode::G_SITOFP:
1494     return legalizeITOFP(MI, MRI, B, true);
1495   case TargetOpcode::G_UITOFP:
1496     return legalizeITOFP(MI, MRI, B, false);
1497   case TargetOpcode::G_FPTOSI:
1498     return legalizeFPTOI(MI, MRI, B, true);
1499   case TargetOpcode::G_FPTOUI:
1500     return legalizeFPTOI(MI, MRI, B, false);
1501   case TargetOpcode::G_FMINNUM:
1502   case TargetOpcode::G_FMAXNUM:
1503   case TargetOpcode::G_FMINNUM_IEEE:
1504   case TargetOpcode::G_FMAXNUM_IEEE:
1505     return legalizeMinNumMaxNum(Helper, MI);
1506   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1507     return legalizeExtractVectorElt(MI, MRI, B);
1508   case TargetOpcode::G_INSERT_VECTOR_ELT:
1509     return legalizeInsertVectorElt(MI, MRI, B);
1510   case TargetOpcode::G_SHUFFLE_VECTOR:
1511     return legalizeShuffleVector(MI, MRI, B);
1512   case TargetOpcode::G_FSIN:
1513   case TargetOpcode::G_FCOS:
1514     return legalizeSinCos(MI, MRI, B);
1515   case TargetOpcode::G_GLOBAL_VALUE:
1516     return legalizeGlobalValue(MI, MRI, B);
1517   case TargetOpcode::G_LOAD:
1518     return legalizeLoad(MI, MRI, B, Observer);
1519   case TargetOpcode::G_FMAD:
1520     return legalizeFMad(MI, MRI, B);
1521   case TargetOpcode::G_FDIV:
1522     return legalizeFDIV(MI, MRI, B);
1523   case TargetOpcode::G_UDIV:
1524   case TargetOpcode::G_UREM:
1525     return legalizeUDIV_UREM(MI, MRI, B);
1526   case TargetOpcode::G_SDIV:
1527   case TargetOpcode::G_SREM:
1528     return legalizeSDIV_SREM(MI, MRI, B);
1529   case TargetOpcode::G_ATOMIC_CMPXCHG:
1530     return legalizeAtomicCmpXChg(MI, MRI, B);
1531   case TargetOpcode::G_FLOG:
1532     return legalizeFlog(MI, B, numbers::ln2f);
1533   case TargetOpcode::G_FLOG10:
1534     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1535   case TargetOpcode::G_FEXP:
1536     return legalizeFExp(MI, B);
1537   case TargetOpcode::G_FPOW:
1538     return legalizeFPow(MI, B);
1539   case TargetOpcode::G_FFLOOR:
1540     return legalizeFFloor(MI, MRI, B);
1541   case TargetOpcode::G_BUILD_VECTOR:
1542     return legalizeBuildVector(MI, MRI, B);
1543   default:
1544     return false;
1545   }
1546 
1547   llvm_unreachable("expected switch to return");
1548 }
1549 
1550 Register AMDGPULegalizerInfo::getSegmentAperture(
1551   unsigned AS,
1552   MachineRegisterInfo &MRI,
1553   MachineIRBuilder &B) const {
1554   MachineFunction &MF = B.getMF();
1555   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1556   const LLT S32 = LLT::scalar(32);
1557 
1558   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1559 
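  // The "aperture" is the 64-bit base address of the LDS or private segment
  // within the flat address space; only its high 32 bits are needed here, so
  // this helper returns just that high word. On subtargets with aperture
  // registers it is read with s_getreg and shifted left by the field width to
  // form the full high word; otherwise it is loaded from the amd_queue_t
  // structure through the queue pointer.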
1560   if (ST.hasApertureRegs()) {
1561     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1562     // getreg.
1563     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1564         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1565         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1566     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1567         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1568         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1569     unsigned Encoding =
1570         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1571         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1572         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1573 
1574     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1575 
1576     B.buildInstr(AMDGPU::S_GETREG_B32)
1577       .addDef(GetReg)
1578       .addImm(Encoding);
1579     MRI.setType(GetReg, S32);
1580 
1581     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1582     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1583   }
1584 
1585   Register QueuePtr = MRI.createGenericVirtualRegister(
1586     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1587 
1588   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1589   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1590     return Register();
1591 
1592   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1593   // private_segment_aperture_base_hi.
1594   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1595 
1596   // TODO: can we be smarter about machine pointer info?
1597   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1598   MachineMemOperand *MMO = MF.getMachineMemOperand(
1599       PtrInfo,
1600       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1601           MachineMemOperand::MOInvariant,
1602       4, commonAlignment(Align(64), StructOffset));
1603 
1604   Register LoadAddr;
1605 
1606   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1607   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1608 }
1609 
1610 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1611   MachineInstr &MI, MachineRegisterInfo &MRI,
1612   MachineIRBuilder &B) const {
1613   MachineFunction &MF = B.getMF();
1614 
1615   const LLT S32 = LLT::scalar(32);
1616   Register Dst = MI.getOperand(0).getReg();
1617   Register Src = MI.getOperand(1).getReg();
1618 
1619   LLT DstTy = MRI.getType(Dst);
1620   LLT SrcTy = MRI.getType(Src);
1621   unsigned DestAS = DstTy.getAddressSpace();
1622   unsigned SrcAS = SrcTy.getAddressSpace();
1623 
1624   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1625   // vector element.
1626   assert(!DstTy.isVector());
1627 
1628   const AMDGPUTargetMachine &TM
1629     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1630 
1631   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1632   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1633     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1634     return true;
1635   }
1636 
1637   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1638     // Truncate.
1639     B.buildExtract(Dst, Src, 0);
1640     MI.eraseFromParent();
1641     return true;
1642   }
1643 
1644   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1645     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1646     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1647 
    // FIXME: This is a bit ugly because we are merging two pointers into a
    // pointer of a different type. Merge operands are required to be the same
    // type, but creating an extra ptrtoint here would be kind of pointless.
1651     auto HighAddr = B.buildConstant(
1652       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1653     B.buildMerge(Dst, {Src, HighAddr});
1654     MI.eraseFromParent();
1655     return true;
1656   }
1657 
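  // Casting from flat to LDS/private keeps only the low 32 bits of the
  // pointer, except that the flat null pointer (0) must map to the segment's
  // null value. The opposite direction below rebuilds the 64-bit flat pointer
  // as {32-bit offset, aperture high word} and maps the segment null value to
  // the flat null pointer.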
1658   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1659     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1660            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1661     unsigned NullVal = TM.getNullPointerValue(DestAS);
1662 
1663     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1664     auto FlatNull = B.buildConstant(SrcTy, 0);
1665 
1666     // Extract low 32-bits of the pointer.
1667     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1668 
1669     auto CmpRes =
1670         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1671     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1672 
1673     MI.eraseFromParent();
1674     return true;
1675   }
1676 
1677   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1678     return false;
1679 
1680   if (!ST.hasFlatAddressSpace())
1681     return false;
1682 
1683   auto SegmentNull =
1684       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1685   auto FlatNull =
1686       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1687 
1688   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1689   if (!ApertureReg.isValid())
1690     return false;
1691 
1692   auto CmpRes =
1693       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1694 
1695   // Coerce the type of the low half of the result so we can use merge_values.
1696   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1697 
1698   // TODO: Should we allow mismatched types but matching sizes in merges to
1699   // avoid the ptrtoint?
1700   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1701   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1702 
1703   MI.eraseFromParent();
1704   return true;
1705 }
1706 
1707 bool AMDGPULegalizerInfo::legalizeFrint(
1708   MachineInstr &MI, MachineRegisterInfo &MRI,
1709   MachineIRBuilder &B) const {
1710   Register Src = MI.getOperand(1).getReg();
1711   LLT Ty = MRI.getType(Src);
1712   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1713 
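  // Round to integer using the add/subtract 2^52 trick: for any double x with
  // |x| < 2^52, (x + copysign(2^52, x)) - copysign(2^52, x) rounds the
  // fraction away under the current rounding mode, since doubles at magnitude
  // 2^52 have no fractional bits. C2 (0x1.fffffffffffffp+51) is the largest
  // double below 2^52; inputs larger than that are already integers and are
  // returned unchanged by the final select.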
1714   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1715   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1716 
1717   auto C1 = B.buildFConstant(Ty, C1Val);
1718   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1719 
1720   // TODO: Should this propagate fast-math-flags?
1721   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1722   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1723 
1724   auto C2 = B.buildFConstant(Ty, C2Val);
1725   auto Fabs = B.buildFAbs(Ty, Src);
1726 
1727   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1728   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1729   MI.eraseFromParent();
1730   return true;
1731 }
1732 
1733 bool AMDGPULegalizerInfo::legalizeFceil(
1734   MachineInstr &MI, MachineRegisterInfo &MRI,
1735   MachineIRBuilder &B) const {
1736 
1737   const LLT S1 = LLT::scalar(1);
1738   const LLT S64 = LLT::scalar(64);
1739 
1740   Register Src = MI.getOperand(1).getReg();
1741   assert(MRI.getType(Src) == S64);
1742 
1743   // result = trunc(src)
1744   // if (src > 0.0 && src != result)
1745   //   result += 1.0
1746 
1747   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1748 
1749   const auto Zero = B.buildFConstant(S64, 0.0);
1750   const auto One = B.buildFConstant(S64, 1.0);
1751   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1752   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1753   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1754   auto Add = B.buildSelect(S64, And, One, Zero);
1755 
1756   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1759 }
1760 
1761 static MachineInstrBuilder extractF64Exponent(Register Hi,
1762                                               MachineIRBuilder &B) {
1763   const unsigned FractBits = 52;
1764   const unsigned ExpBits = 11;
1765   LLT S32 = LLT::scalar(32);
1766 
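  // The biased exponent occupies bits [62:52] of an IEEE double, i.e. bits
  // [30:20] of the high 32-bit word. Extract those 11 bits with ubfe and
  // subtract the bias (1023) to get the unbiased exponent.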
1767   auto Const0 = B.buildConstant(S32, FractBits - 32);
1768   auto Const1 = B.buildConstant(S32, ExpBits);
1769 
1770   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1771     .addUse(Hi)
1772     .addUse(Const0.getReg(0))
1773     .addUse(Const1.getReg(0));
1774 
1775   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1776 }
1777 
1778 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1779   MachineInstr &MI, MachineRegisterInfo &MRI,
1780   MachineIRBuilder &B) const {
1781   const LLT S1 = LLT::scalar(1);
1782   const LLT S32 = LLT::scalar(32);
1783   const LLT S64 = LLT::scalar(64);
1784 
1785   Register Src = MI.getOperand(1).getReg();
1786   assert(MRI.getType(Src) == S64);
1787 
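  // Truncation is done by clearing the fractional bits of the mantissa: shift
  // the 52-bit fraction mask right by the unbiased exponent to find which
  // bits are fractional, then AND them away. An exponent below zero means the
  // magnitude is < 1, so only the sign bit survives (+/-0); an exponent above
  // 51 means the value is already an integer and is returned as-is.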
1788   // TODO: Should this use extract since the low half is unused?
1789   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1790   Register Hi = Unmerge.getReg(1);
1791 
1792   // Extract the upper half, since this is where we will find the sign and
1793   // exponent.
1794   auto Exp = extractF64Exponent(Hi, B);
1795 
1796   const unsigned FractBits = 52;
1797 
1798   // Extract the sign bit.
1799   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1800   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1801 
1802   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1803 
1804   const auto Zero32 = B.buildConstant(S32, 0);
1805 
1806   // Extend back to 64-bits.
1807   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1808 
1809   auto Shr = B.buildAShr(S64, FractMask, Exp);
1810   auto Not = B.buildNot(S64, Shr);
1811   auto Tmp0 = B.buildAnd(S64, Src, Not);
1812   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1813 
1814   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1815   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1816 
1817   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1818   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1819   MI.eraseFromParent();
1820   return true;
1821 }
1822 
1823 bool AMDGPULegalizerInfo::legalizeITOFP(
1824   MachineInstr &MI, MachineRegisterInfo &MRI,
1825   MachineIRBuilder &B, bool Signed) const {
1826 
1827   Register Dst = MI.getOperand(0).getReg();
1828   Register Src = MI.getOperand(1).getReg();
1829 
1830   const LLT S64 = LLT::scalar(64);
1831   const LLT S32 = LLT::scalar(32);
1832 
1833   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1834 
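  // Convert the two 32-bit halves separately and recombine as
  // fp(hi) * 2^32 + fp(lo). Only the high half carries the sign, so it uses
  // the signed or unsigned conversion, while the low half is always treated
  // as unsigned; ldexp performs the exact scaling by 2^32.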
1835   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1836 
1837   auto CvtHi = Signed ?
1838     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1839     B.buildUITOFP(S64, Unmerge.getReg(1));
1840 
1841   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1842 
1843   auto ThirtyTwo = B.buildConstant(S32, 32);
1844   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1845     .addUse(CvtHi.getReg(0))
1846     .addUse(ThirtyTwo.getReg(0));
1847 
1848   // TODO: Should this propagate fast-math-flags?
1849   B.buildFAdd(Dst, LdExp, CvtLo);
1850   MI.eraseFromParent();
1851   return true;
1852 }
1853 
1854 // TODO: Copied from DAG implementation. Verify logic and document how this
1855 // actually works.
1856 bool AMDGPULegalizerInfo::legalizeFPTOI(
1857   MachineInstr &MI, MachineRegisterInfo &MRI,
1858   MachineIRBuilder &B, bool Signed) const {
1859 
1860   Register Dst = MI.getOperand(0).getReg();
1861   Register Src = MI.getOperand(1).getReg();
1862 
1863   const LLT S64 = LLT::scalar(64);
1864   const LLT S32 = LLT::scalar(32);
1865 
1866   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1867 
1868   unsigned Flags = MI.getFlags();
1869 
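  // K0 below is 2^-32 and K1 is -(2^32). The high 32 bits of the result are
  // floor(trunc(x) * 2^-32) converted to integer; the fma then computes
  // trunc(x) - hi * 2^32, i.e. the remaining low 32 bits, which are always
  // converted as unsigned.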
1870   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1871   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1872   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1873 
1874   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1875   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1876   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1877 
1878   auto Hi = Signed ?
1879     B.buildFPTOSI(S32, FloorMul) :
1880     B.buildFPTOUI(S32, FloorMul);
1881   auto Lo = B.buildFPTOUI(S32, Fma);
1882 
1883   B.buildMerge(Dst, { Lo, Hi });
1884   MI.eraseFromParent();
1885 
1886   return true;
1887 }
1888 
1889 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1890                                                MachineInstr &MI) const {
1891   MachineFunction &MF = Helper.MIRBuilder.getMF();
1892   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1893 
1894   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1895                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1896 
1897   // With ieee_mode disabled, the instructions have the correct behavior
1898   // already for G_FMINNUM/G_FMAXNUM
1899   if (!MFI->getMode().IEEE)
1900     return !IsIEEEOp;
1901 
1902   if (IsIEEEOp)
1903     return true;
1904 
1905   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1906 }
1907 
1908 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1909   MachineInstr &MI, MachineRegisterInfo &MRI,
1910   MachineIRBuilder &B) const {
1911   // TODO: Should move some of this into LegalizerHelper.
1912 
1913   // TODO: Promote dynamic indexing of s16 to s32
1914 
1915   // FIXME: Artifact combiner probably should have replaced the truncated
1916   // constant before this, so we shouldn't need
1917   // getConstantVRegValWithLookThrough.
1918   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1919     MI.getOperand(2).getReg(), MRI);
1920   if (!IdxVal) // Dynamic case will be selected to register indexing.
1921     return true;
1922 
1923   Register Dst = MI.getOperand(0).getReg();
1924   Register Vec = MI.getOperand(1).getReg();
1925 
1926   LLT VecTy = MRI.getType(Vec);
1927   LLT EltTy = VecTy.getElementType();
1928   assert(EltTy == MRI.getType(Dst));
1929 
1930   if (IdxVal->Value < VecTy.getNumElements())
1931     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1932   else
1933     B.buildUndef(Dst);
1934 
1935   MI.eraseFromParent();
1936   return true;
1937 }
1938 
1939 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1940   MachineInstr &MI, MachineRegisterInfo &MRI,
1941   MachineIRBuilder &B) const {
1942   // TODO: Should move some of this into LegalizerHelper.
1943 
1944   // TODO: Promote dynamic indexing of s16 to s32
1945 
1946   // FIXME: Artifact combiner probably should have replaced the truncated
1947   // constant before this, so we shouldn't need
1948   // getConstantVRegValWithLookThrough.
1949   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1950     MI.getOperand(3).getReg(), MRI);
1951   if (!IdxVal) // Dynamic case will be selected to register indexing.
1952     return true;
1953 
1954   Register Dst = MI.getOperand(0).getReg();
1955   Register Vec = MI.getOperand(1).getReg();
1956   Register Ins = MI.getOperand(2).getReg();
1957 
1958   LLT VecTy = MRI.getType(Vec);
1959   LLT EltTy = VecTy.getElementType();
1960   assert(EltTy == MRI.getType(Ins));
1961 
1962   if (IdxVal->Value < VecTy.getNumElements())
1963     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1964   else
1965     B.buildUndef(Dst);
1966 
1967   MI.eraseFromParent();
1968   return true;
1969 }
1970 
1971 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1972   MachineInstr &MI, MachineRegisterInfo &MRI,
1973   MachineIRBuilder &B) const {
1974   const LLT V2S16 = LLT::vector(2, 16);
1975 
1976   Register Dst = MI.getOperand(0).getReg();
1977   Register Src0 = MI.getOperand(1).getReg();
1978   LLT DstTy = MRI.getType(Dst);
1979   LLT SrcTy = MRI.getType(Src0);
1980 
1981   if (SrcTy == V2S16 && DstTy == V2S16 &&
1982       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1983     return true;
1984 
1985   MachineIRBuilder HelperBuilder(MI);
1986   GISelObserverWrapper DummyObserver;
1987   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1988   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1989 }
1990 
1991 bool AMDGPULegalizerInfo::legalizeSinCos(
1992   MachineInstr &MI, MachineRegisterInfo &MRI,
1993   MachineIRBuilder &B) const {
1994 
1995   Register DstReg = MI.getOperand(0).getReg();
1996   Register SrcReg = MI.getOperand(1).getReg();
1997   LLT Ty = MRI.getType(DstReg);
1998   unsigned Flags = MI.getFlags();
1999 
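  // The hardware sin/cos intrinsics take their input scaled by 1/(2*pi),
  // i.e. in fractions of a full period, hence the multiply below. On
  // subtargets reporting hasTrigReducedRange(), the accepted input range is
  // limited, so an explicit amdgcn.fract first reduces the argument into
  // [0, 1).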
2000   Register TrigVal;
2001   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2002   if (ST.hasTrigReducedRange()) {
2003     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2004     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2005       .addUse(MulVal.getReg(0))
2006       .setMIFlags(Flags).getReg(0);
2007   } else
2008     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2009 
2010   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2011     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2012   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2013     .addUse(TrigVal)
2014     .setMIFlags(Flags);
2015   MI.eraseFromParent();
2016   return true;
2017 }
2018 
2019 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2020                                                   MachineIRBuilder &B,
2021                                                   const GlobalValue *GV,
2022                                                   int64_t Offset,
2023                                                   unsigned GAFlags) const {
2024   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2025   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2026   // to the following code sequence:
2027   //
2028   // For constant address space:
2029   //   s_getpc_b64 s[0:1]
2030   //   s_add_u32 s0, s0, $symbol
2031   //   s_addc_u32 s1, s1, 0
2032   //
2033   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2034   //   a fixup or relocation is emitted to replace $symbol with a literal
2035   //   constant, which is a pc-relative offset from the encoding of the $symbol
2036   //   operand to the global variable.
2037   //
2038   // For global address space:
2039   //   s_getpc_b64 s[0:1]
2040   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2041   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2042   //
2043   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2044   //   fixups or relocations are emitted to replace $symbol@*@lo and
2045   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2046   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2047   //   operand to the global variable.
2048   //
2049   // What we want here is an offset from the value returned by s_getpc
2050   // (which is the address of the s_add_u32 instruction) to the global
2051   // variable, but since the encoding of $symbol starts 4 bytes after the start
2052   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2053   // small. This requires us to add 4 to the global variable offset in order to
2054   // compute the correct address.
2055 
2056   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2057 
2058   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2059     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2060 
2061   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2062     .addDef(PCReg);
2063 
2064   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2065   if (GAFlags == SIInstrInfo::MO_NONE)
2066     MIB.addImm(0);
2067   else
2068     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2069 
2070   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2071 
2072   if (PtrTy.getSizeInBits() == 32)
2073     B.buildExtract(DstReg, PCReg, 0);
2074   return true;
}
2076 
2077 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2078   MachineInstr &MI, MachineRegisterInfo &MRI,
2079   MachineIRBuilder &B) const {
2080   Register DstReg = MI.getOperand(0).getReg();
2081   LLT Ty = MRI.getType(DstReg);
2082   unsigned AS = Ty.getAddressSpace();
2083 
2084   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2085   MachineFunction &MF = B.getMF();
2086   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2087 
2088   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2089     if (!MFI->isEntryFunction()) {
2090       const Function &Fn = MF.getFunction();
2091       DiagnosticInfoUnsupported BadLDSDecl(
2092         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2093         DS_Warning);
2094       Fn.getContext().diagnose(BadLDSDecl);
2095 
2096       // We currently don't have a way to correctly allocate LDS objects that
2097       // aren't directly associated with a kernel. We do force inlining of
2098       // functions that use local objects. However, if these dead functions are
2099       // not eliminated, we don't want a compile time error. Just emit a warning
2100       // and a trap, since there should be no callable path here.
2101       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2102       B.buildUndef(DstReg);
2103       MI.eraseFromParent();
2104       return true;
2105     }
2106 
2107     // TODO: We could emit code to handle the initialization somewhere.
2108     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2109       const SITargetLowering *TLI = ST.getTargetLowering();
2110       if (!TLI->shouldUseLDSConstAddress(GV)) {
2111         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2112         return true; // Leave in place;
2113       }
2114 
2115       B.buildConstant(
2116           DstReg,
2117           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2118       MI.eraseFromParent();
2119       return true;
2120     }
2121 
2122     const Function &Fn = MF.getFunction();
2123     DiagnosticInfoUnsupported BadInit(
2124       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2125     Fn.getContext().diagnose(BadInit);
2126     return true;
2127   }
2128 
2129   const SITargetLowering *TLI = ST.getTargetLowering();
2130 
2131   if (TLI->shouldEmitFixup(GV)) {
2132     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2133     MI.eraseFromParent();
2134     return true;
2135   }
2136 
2137   if (TLI->shouldEmitPCReloc(GV)) {
2138     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2139     MI.eraseFromParent();
2140     return true;
2141   }
2142 
2143   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2144   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2145 
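  // Globals that are neither fixups nor plain pc-relative relocations go
  // through the GOT: build the address of the GOT slot pc-relatively, then
  // load the real 64-bit address from it as an invariant load. For 32-bit
  // destinations only the low half of the loaded pointer is kept.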
2146   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2147       MachinePointerInfo::getGOT(MF),
2148       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2149           MachineMemOperand::MOInvariant,
2150       8 /*Size*/, Align(8));
2151 
2152   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2153 
2154   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2156     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2157     B.buildExtract(DstReg, Load, 0);
2158   } else
2159     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2160 
2161   MI.eraseFromParent();
2162   return true;
2163 }
2164 
2165 bool AMDGPULegalizerInfo::legalizeLoad(
2166   MachineInstr &MI, MachineRegisterInfo &MRI,
2167   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2168   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2169   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2170   Observer.changingInstr(MI);
2171   MI.getOperand(1).setReg(Cast.getReg(0));
2172   Observer.changedInstr(MI);
2173   return true;
2174 }
2175 
2176 bool AMDGPULegalizerInfo::legalizeFMad(
2177   MachineInstr &MI, MachineRegisterInfo &MRI,
2178   MachineIRBuilder &B) const {
2179   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2180   assert(Ty.isScalar());
2181 
2182   MachineFunction &MF = B.getMF();
2183   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2184 
2185   // TODO: Always legal with future ftz flag.
2186   // FIXME: Do we need just output?
2187   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2188     return true;
2189   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2190     return true;
2191 
2192   MachineIRBuilder HelperBuilder(MI);
2193   GISelObserverWrapper DummyObserver;
2194   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2195   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2196 }
2197 
2198 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2199   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2200   Register DstReg = MI.getOperand(0).getReg();
2201   Register PtrReg = MI.getOperand(1).getReg();
2202   Register CmpVal = MI.getOperand(2).getReg();
2203   Register NewVal = MI.getOperand(3).getReg();
2204 
2205   assert(SITargetLowering::isFlatGlobalAddrSpace(
2206            MRI.getType(PtrReg).getAddressSpace()) &&
2207          "this should not have been custom lowered");
2208 
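  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
  // together as a two-element vector operand, so build that vector here and
  // forward the original memory operands.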
2209   LLT ValTy = MRI.getType(CmpVal);
2210   LLT VecTy = LLT::vector(2, ValTy);
2211 
2212   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2213 
2214   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2215     .addDef(DstReg)
2216     .addUse(PtrReg)
2217     .addUse(PackedVal)
2218     .setMemRefs(MI.memoperands());
2219 
2220   MI.eraseFromParent();
2221   return true;
2222 }
2223 
2224 bool AMDGPULegalizerInfo::legalizeFlog(
2225   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2226   Register Dst = MI.getOperand(0).getReg();
2227   Register Src = MI.getOperand(1).getReg();
2228   LLT Ty = B.getMRI()->getType(Dst);
2229   unsigned Flags = MI.getFlags();
2230 
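  // log_b(x) = log2(x) * (1 / log2(b)); the callers pass ln(2) for G_FLOG and
  // ln(2)/ln(10) for G_FLOG10 as Log2BaseInverted.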
2231   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2232   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2233 
2234   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2235   MI.eraseFromParent();
2236   return true;
2237 }
2238 
2239 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2240                                        MachineIRBuilder &B) const {
2241   Register Dst = MI.getOperand(0).getReg();
2242   Register Src = MI.getOperand(1).getReg();
2243   unsigned Flags = MI.getFlags();
2244   LLT Ty = B.getMRI()->getType(Dst);
2245 
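  // exp(x) = exp2(x * log2(e)).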
2246   auto K = B.buildFConstant(Ty, numbers::log2e);
2247   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2248   B.buildFExp2(Dst, Mul, Flags);
2249   MI.eraseFromParent();
2250   return true;
2251 }
2252 
2253 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2254                                        MachineIRBuilder &B) const {
2255   Register Dst = MI.getOperand(0).getReg();
2256   Register Src0 = MI.getOperand(1).getReg();
2257   Register Src1 = MI.getOperand(2).getReg();
2258   unsigned Flags = MI.getFlags();
2259   LLT Ty = B.getMRI()->getType(Dst);
2260   const LLT S16 = LLT::scalar(16);
2261   const LLT S32 = LLT::scalar(32);
2262 
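  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses
  // fmul_legacy, where 0 * anything is 0, so y == 0 still produces
  // exp2(0) = 1 even when log2(x) is infinite.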
2263   if (Ty == S32) {
2264     auto Log = B.buildFLog2(S32, Src0, Flags);
2265     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2266       .addUse(Log.getReg(0))
2267       .addUse(Src1)
2268       .setMIFlags(Flags);
2269     B.buildFExp2(Dst, Mul, Flags);
2270   } else if (Ty == S16) {
2271     // There's no f16 fmul_legacy, so we need to convert for it.
2272     auto Log = B.buildFLog2(S16, Src0, Flags);
2273     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2274     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2275     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2276       .addUse(Ext0.getReg(0))
2277       .addUse(Ext1.getReg(0))
2278       .setMIFlags(Flags);
2279 
2280     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2281   } else
2282     return false;
2283 
2284   MI.eraseFromParent();
2285   return true;
2286 }
2287 
2288 // Find a source register, ignoring any possible source modifiers.
2289 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2290   Register ModSrc = OrigSrc;
2291   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2292     ModSrc = SrcFNeg->getOperand(1).getReg();
2293     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2294       ModSrc = SrcFAbs->getOperand(1).getReg();
2295   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2296     ModSrc = SrcFAbs->getOperand(1).getReg();
2297   return ModSrc;
2298 }
2299 
2300 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2301                                          MachineRegisterInfo &MRI,
2302                                          MachineIRBuilder &B) const {
2303 
2304   const LLT S1 = LLT::scalar(1);
2305   const LLT S64 = LLT::scalar(64);
2306   Register Dst = MI.getOperand(0).getReg();
2307   Register OrigSrc = MI.getOperand(1).getReg();
2308   unsigned Flags = MI.getFlags();
2309   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2310          "this should not have been custom lowered");
2311 
2312   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2313   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2314   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2315   // V_FRACT bug is:
2316   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2317   //
2318   // Convert floor(x) to (x - fract(x))
2319 
2320   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2321     .addUse(OrigSrc)
2322     .setMIFlags(Flags);
2323 
2324   // Give source modifier matching some assistance before obscuring a foldable
2325   // pattern.
2326 
2327   // TODO: We can avoid the neg on the fract? The input sign to fract
2328   // shouldn't matter?
2329   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2330 
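  // 0x3fefffffffffffff is the largest double below 1.0 (1.0 - 2^-53); it is
  // the clamp value from the V_FRACT workaround described above.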
2331   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2332 
2333   Register Min = MRI.createGenericVirtualRegister(S64);
2334 
2335   // We don't need to concern ourselves with the snan handling difference, so
2336   // use the one which will directly select.
2337   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2338   if (MFI->getMode().IEEE)
2339     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2340   else
2341     B.buildFMinNum(Min, Fract, Const, Flags);
2342 
2343   Register CorrectedFract = Min;
2344   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2345     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2346     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2347   }
2348 
2349   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2350   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2351 
2352   MI.eraseFromParent();
2353   return true;
2354 }
2355 
2356 // Turn an illegal packed v2s16 build vector into bit operations.
2357 // TODO: This should probably be a bitcast action in LegalizerHelper.
2358 bool AMDGPULegalizerInfo::legalizeBuildVector(
2359   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2360   Register Dst = MI.getOperand(0).getReg();
2361   const LLT S32 = LLT::scalar(32);
2362   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2363 
2364   Register Src0 = MI.getOperand(1).getReg();
2365   Register Src1 = MI.getOperand(2).getReg();
2366   assert(MRI.getType(Src0) == LLT::scalar(16));
2367 
2368   auto Merge = B.buildMerge(S32, {Src0, Src1});
2369   B.buildBitcast(Dst, Merge);
2370 
2371   MI.eraseFromParent();
2372   return true;
2373 }
2374 
2375 // Return the use branch instruction, otherwise null if the usage is invalid.
2376 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2377                                        MachineRegisterInfo &MRI,
2378                                        MachineInstr *&Br,
2379                                        MachineBasicBlock *&UncondBrTarget) {
2380   Register CondDef = MI.getOperand(0).getReg();
2381   if (!MRI.hasOneNonDBGUse(CondDef))
2382     return nullptr;
2383 
2384   MachineBasicBlock *Parent = MI.getParent();
2385   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2386   if (UseMI.getParent() != Parent ||
2387       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2388     return nullptr;
2389 
2390   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2391   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2392   if (Next == Parent->end()) {
2393     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2394     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2395       return nullptr;
2396     UncondBrTarget = &*NextMBB;
2397   } else {
2398     if (Next->getOpcode() != AMDGPU::G_BR)
2399       return nullptr;
2400     Br = &*Next;
2401     UncondBrTarget = Br->getOperand(0).getMBB();
2402   }
2403 
2404   return &UseMI;
2405 }
2406 
2407 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2408                                                MachineRegisterInfo &MRI,
2409                                                Register LiveIn,
2410                                                Register PhyReg) const {
2411   assert(PhyReg.isPhysical() && "Physical register expected");
2412 
  // Insert the live-in copy, if required, by defining the destination
  // virtual register.
2415   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2416   if (!MRI.getVRegDef(LiveIn)) {
2417     // FIXME: Should have scoped insert pt
2418     MachineBasicBlock &OrigInsBB = B.getMBB();
2419     auto OrigInsPt = B.getInsertPt();
2420 
2421     MachineBasicBlock &EntryMBB = B.getMF().front();
2422     EntryMBB.addLiveIn(PhyReg);
2423     B.setInsertPt(EntryMBB, EntryMBB.begin());
2424     B.buildCopy(LiveIn, PhyReg);
2425 
2426     B.setInsertPt(OrigInsBB, OrigInsPt);
2427   }
2428 
2429   return LiveIn;
2430 }
2431 
2432 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2433                                                 MachineRegisterInfo &MRI,
2434                                                 Register PhyReg, LLT Ty,
2435                                                 bool InsertLiveInCopy) const {
2436   assert(PhyReg.isPhysical() && "Physical register expected");
2437 
  // Get or create the virtual live-in register.
2439   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2440   if (!LiveIn) {
2441     LiveIn = MRI.createGenericVirtualRegister(Ty);
2442     MRI.addLiveIn(PhyReg, LiveIn);
2443   }
2444 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting a live-in copy from the
  // physical register to a virtual register is not required.
2448   if (!InsertLiveInCopy)
2449     return LiveIn;
2450 
2451   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2452 }
2453 
2454 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2455     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2456   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2457   const ArgDescriptor *Arg;
2458   const TargetRegisterClass *RC;
2459   LLT ArgTy;
2460   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2461   if (!Arg) {
2462     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2463     return nullptr;
2464   }
2465   return Arg;
2466 }
2467 
2468 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2469                                          const ArgDescriptor *Arg) const {
2470   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2471     return false; // TODO: Handle these
2472 
2473   Register SrcReg = Arg->getRegister();
2474   assert(SrcReg.isPhysical() && "Physical register expected");
2475   assert(DstReg.isVirtual() && "Virtual register expected");
2476 
2477   MachineRegisterInfo &MRI = *B.getMRI();
2478 
2479   LLT Ty = MRI.getType(DstReg);
2480   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2481 
2482   if (Arg->isMasked()) {
2483     // TODO: Should we try to emit this once in the entry block?
2484     const LLT S32 = LLT::scalar(32);
2485     const unsigned Mask = Arg->getMask();
2486     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2487 
2488     Register AndMaskSrc = LiveIn;
2489 
2490     if (Shift != 0) {
2491       auto ShiftAmt = B.buildConstant(S32, Shift);
2492       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2493     }
2494 
2495     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2496   } else {
2497     B.buildCopy(DstReg, LiveIn);
2498   }
2499 
2500   return true;
2501 }
2502 
2503 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2504     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2505     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2506 
2507   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2508   if (!Arg)
2509     return false;
2510 
2511   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2512     return false;
2513 
2514   MI.eraseFromParent();
2515   return true;
2516 }
2517 
2518 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2519                                        MachineRegisterInfo &MRI,
2520                                        MachineIRBuilder &B) const {
2521   Register Dst = MI.getOperand(0).getReg();
2522   LLT DstTy = MRI.getType(Dst);
2523   LLT S16 = LLT::scalar(16);
2524   LLT S32 = LLT::scalar(32);
2525   LLT S64 = LLT::scalar(64);
2526 
2527   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2528     return true;
2529 
2530   if (DstTy == S16)
2531     return legalizeFDIV16(MI, MRI, B);
2532   if (DstTy == S32)
2533     return legalizeFDIV32(MI, MRI, B);
2534   if (DstTy == S64)
2535     return legalizeFDIV64(MI, MRI, B);
2536 
2537   return false;
2538 }
2539 
2540 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2541                                                   Register DstReg,
2542                                                   Register X,
2543                                                   Register Y,
2544                                                   bool IsDiv) const {
2545   const LLT S1 = LLT::scalar(1);
2546   const LLT S32 = LLT::scalar(32);
2547 
2548   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2549   // algorithm used here.
2550 
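  // In short: start from a float reciprocal of y scaled by ~2^32
  // (0x4f7ffffe) to get an integer estimate z ~= 2^32 / y, refine it with one
  // Newton-Raphson style step using umulh, then form the quotient and
  // remainder estimates and correct them with the two compare/select steps
  // below, since the estimate may still be slightly too small.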
2551   // Initial estimate of inv(y).
2552   auto FloatY = B.buildUITOFP(S32, Y);
2553   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2554   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2555   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2556   auto Z = B.buildFPTOUI(S32, ScaledY);
2557 
2558   // One round of UNR.
2559   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2560   auto NegYZ = B.buildMul(S32, NegY, Z);
2561   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2562 
2563   // Quotient/remainder estimate.
2564   auto Q = B.buildUMulH(S32, X, Z);
2565   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2566 
2567   // First quotient/remainder refinement.
2568   auto One = B.buildConstant(S32, 1);
2569   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2570   if (IsDiv)
2571     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2572   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2573 
2574   // Second quotient/remainder refinement.
2575   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2576   if (IsDiv)
2577     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2578   else
2579     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2580 }
2581 
2582 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2583                                               MachineRegisterInfo &MRI,
2584                                               MachineIRBuilder &B) const {
2585   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2586   Register DstReg = MI.getOperand(0).getReg();
2587   Register Num = MI.getOperand(1).getReg();
2588   Register Den = MI.getOperand(2).getReg();
2589   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2590   MI.eraseFromParent();
2591   return true;
2592 }
2593 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2595 //
2596 // Return lo, hi of result
2597 //
2598 // %cvt.lo = G_UITOFP Val.lo
2599 // %cvt.hi = G_UITOFP Val.hi
2600 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2601 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2602 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2603 // %mul2 = G_FMUL %mul1, 2**(-32)
2604 // %trunc = G_INTRINSIC_TRUNC %mul2
2605 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2606 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
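//
// The f32 constants in the sequence are bit patterns: 0x4f800000 is 2^32,
// 0x5f7ffffc is just below 2^64, 0x2f800000 is 2^-32 and 0xcf800000 is
// -(2^32); together they scale the reciprocal so the result approximates
// 2^64 / Val, split into low and high 32-bit halves.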
2607 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2608                                                        Register Val) {
2609   const LLT S32 = LLT::scalar(32);
2610   auto Unmerge = B.buildUnmerge(S32, Val);
2611 
2612   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2613   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2614 
2615   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2616                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2617 
2618   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2619   auto Mul1 =
2620       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2621 
2622   // 2**(-32)
2623   auto Mul2 =
2624       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2625   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2626 
2627   // -(2**32)
2628   auto Mad2 = B.buildFMAD(S32, Trunc,
2629                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2630 
2631   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2632   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2633 
2634   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2635 }
2636 
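// 64-bit expansion of unsigned division/remainder. It follows the same shape
// as the 32-bit path: emitReciprocalU64 provides an estimate of 2^64 / Denom,
// which is refined with two multiply/multiply-high rounds (with explicit
// carry propagation across the 32-bit halves), and the resulting quotient and
// remainder estimates are then corrected by up to two conditional steps,
// currently expressed as compare+select rather than real control flow (see
// the TODO below).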
2637 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2638                                                   Register DstReg,
2639                                                   Register Numer,
2640                                                   Register Denom,
2641                                                   bool IsDiv) const {
2642   const LLT S32 = LLT::scalar(32);
2643   const LLT S64 = LLT::scalar(64);
2644   const LLT S1 = LLT::scalar(1);
2645   Register RcpLo, RcpHi;
2646 
2647   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2648 
2649   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2650 
2651   auto Zero64 = B.buildConstant(S64, 0);
2652   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2653 
2654   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2655   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2656 
2657   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2658   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2659   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2660 
2661   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2662   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2663   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2664   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2665 
2666   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2667   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2668   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2669   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2670   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2671 
2672   auto Zero32 = B.buildConstant(S32, 0);
2673   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2674   auto Add2_HiC =
2675       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2676   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2677   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2678 
2679   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2680   Register NumerLo = UnmergeNumer.getReg(0);
2681   Register NumerHi = UnmergeNumer.getReg(1);
2682 
2683   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2684   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2685   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2686   Register Mul3_Lo = UnmergeMul3.getReg(0);
2687   Register Mul3_Hi = UnmergeMul3.getReg(1);
2688   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2689   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2690   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2691   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2692 
2693   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2694   Register DenomLo = UnmergeDenom.getReg(0);
2695   Register DenomHi = UnmergeDenom.getReg(1);
2696 
2697   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2698   auto C1 = B.buildSExt(S32, CmpHi);
2699 
2700   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2701   auto C2 = B.buildSExt(S32, CmpLo);
2702 
2703   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2704   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2705 
2706   // TODO: Here and below portions of the code can be enclosed into if/endif.
2707   // Currently control flow is unconditional and we have 4 selects after
2708   // potential endif to substitute PHIs.
2709 
2710   // if C3 != 0 ...
2711   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2712   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2713   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2714   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2715 
2716   auto One64 = B.buildConstant(S64, 1);
2717   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2718 
2719   auto C4 =
2720       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2721   auto C5 =
2722       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2723   auto C6 = B.buildSelect(
2724       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2725 
2726   // if (C6 != 0)
2727   auto Add4 = B.buildAdd(S64, Add3, One64);
2728   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2729 
2730   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2731   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2732   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2733 
2734   // endif C6
2735   // endif C3
2736 
2737   if (IsDiv) {
2738     auto Sel1 = B.buildSelect(
2739         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2740     B.buildSelect(DstReg,
2741                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2742   } else {
2743     auto Sel2 = B.buildSelect(
2744         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2745     B.buildSelect(DstReg,
2746                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2747   }
2748 }
2749 
2750 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2751                                             MachineRegisterInfo &MRI,
2752                                             MachineIRBuilder &B) const {
2753   const LLT S64 = LLT::scalar(64);
2754   const LLT S32 = LLT::scalar(32);
2755   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2756   Register DstReg = MI.getOperand(0).getReg();
2757   Register Num = MI.getOperand(1).getReg();
2758   Register Den = MI.getOperand(2).getReg();
2759   LLT Ty = MRI.getType(DstReg);
2760 
2761   if (Ty == S32)
2762     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2763   else if (Ty == S64)
2764     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2765   else
2766     return false;
2767 
2768   MI.eraseFromParent();
  return true;
}
2772 
2773 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2774                                             MachineRegisterInfo &MRI,
2775                                             MachineIRBuilder &B) const {
2776   const LLT S64 = LLT::scalar(64);
2777   const LLT S32 = LLT::scalar(32);
2778 
2779   Register DstReg = MI.getOperand(0).getReg();
2780   const LLT Ty = MRI.getType(DstReg);
2781   if (Ty != S32 && Ty != S64)
2782     return false;
2783 
2784   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2785 
2786   Register LHS = MI.getOperand(1).getReg();
2787   Register RHS = MI.getOperand(2).getReg();
2788 
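  // Reduce to the unsigned case: sign = x >> (bits - 1) is 0 or -1, and
  // (x + sign) ^ sign is |x| in two's complement (e.g. x = -7: sign = -1,
  // (-7 + -1) ^ -1 = -8 ^ -1 = 7). The quotient's sign is sign(LHS) ^
  // sign(RHS), the remainder takes the sign of LHS, and the final
  // (result ^ sign) - sign re-applies that sign.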
2789   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2790   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2791   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2792 
2793   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2794   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2795 
2796   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2797   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2798 
2799   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2800   if (Ty == S32)
2801     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2802   else
2803     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2804 
2805   Register Sign;
2806   if (IsDiv)
2807     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2808   else
2809     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2810 
2811   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2812   B.buildSub(DstReg, UDivRem, Sign);
2813 
2814   MI.eraseFromParent();
2815   return true;
2816 }
2817 
2818 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2819                                                  MachineRegisterInfo &MRI,
2820                                                  MachineIRBuilder &B) const {
2821   Register Res = MI.getOperand(0).getReg();
2822   Register LHS = MI.getOperand(1).getReg();
2823   Register RHS = MI.getOperand(2).getReg();
2824 
2825   uint16_t Flags = MI.getFlags();
2826 
2827   LLT ResTy = MRI.getType(Res);
2828   LLT S32 = LLT::scalar(32);
2829   LLT S64 = LLT::scalar(64);
2830 
2831   const MachineFunction &MF = B.getMF();
2832   bool Unsafe =
2833     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2834 
2835   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2836     return false;
2837 
2838   if (!Unsafe && ResTy == S32 &&
2839       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2840     return false;
2841 
2842   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2843     // 1 / x -> RCP(x)
2844     if (CLHS->isExactlyValue(1.0)) {
2845       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2846         .addUse(RHS)
2847         .setMIFlags(Flags);
2848 
2849       MI.eraseFromParent();
2850       return true;
2851     }
2852 
2853     // -1 / x -> RCP( FNEG(x) )
2854     if (CLHS->isExactlyValue(-1.0)) {
2855       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2856       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2857         .addUse(FNeg.getReg(0))
2858         .setMIFlags(Flags);
2859 
2860       MI.eraseFromParent();
2861       return true;
2862     }
2863   }
2864 
2865   // x / y -> x * (1.0 / y)
2866   if (Unsafe) {
2867     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2868       .addUse(RHS)
2869       .setMIFlags(Flags);
2870     B.buildFMul(Res, LHS, RCP, Flags);
2871 
2872     MI.eraseFromParent();
2873     return true;
2874   }
2875 
2876   return false;
2877 }
2878 
2879 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2880                                          MachineRegisterInfo &MRI,
2881                                          MachineIRBuilder &B) const {
2882   Register Res = MI.getOperand(0).getReg();
2883   Register LHS = MI.getOperand(1).getReg();
2884   Register RHS = MI.getOperand(2).getReg();
2885 
2886   uint16_t Flags = MI.getFlags();
2887 
2888   LLT S16 = LLT::scalar(16);
2889   LLT S32 = LLT::scalar(32);
2890 
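       // Lower the f16 division by widening both operands to f32, multiplying by
       // an f32 reciprocal approximation, truncating back to f16, and letting
       // div_fixup handle the special cases.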
2891   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2892   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2893 
2894   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2895     .addUse(RHSExt.getReg(0))
2896     .setMIFlags(Flags);
2897 
2898   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2899   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2900 
2901   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2902     .addUse(RDst.getReg(0))
2903     .addUse(RHS)
2904     .addUse(LHS)
2905     .setMIFlags(Flags);
2906 
2907   MI.eraseFromParent();
2908   return true;
2909 }
2910 
2911 // Enable or disable FP32 denorm mode: when 'Enable' is true, emit instructions
2912 // to enable denorm mode; otherwise, emit instructions to disable it.
2913 static void toggleSPDenormMode(bool Enable,
2914                                MachineIRBuilder &B,
2915                                const GCNSubtarget &ST,
2916                                AMDGPU::SIModeRegisterDefaults Mode) {
2917   // Set SP denorm mode to this value.
2918   unsigned SPDenormMode =
2919     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2920 
2921   if (ST.hasDenormModeInst()) {
2922     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2923     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2924 
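         // The S_DENORM_MODE immediate packs the FP32 denorm bits in [1:0] and the
         // FP64/FP16 denorm bits in [3:2].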
2925     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2926     B.buildInstr(AMDGPU::S_DENORM_MODE)
2927       .addImm(NewDenormModeValue);
2928 
2929   } else {
2930     // Select FP32 bit field in mode register.
2931     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2932                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2933                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
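         // This selects a 2-bit field at offset 4 of the MODE register, i.e. the
         // FP32 denorm controls; WIDTH_M1 encodes the field width minus one.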
2934 
2935     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2936       .addImm(SPDenormMode)
2937       .addImm(SPDenormModeBitField);
2938   }
2939 }
2940 
2941 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2942                                          MachineRegisterInfo &MRI,
2943                                          MachineIRBuilder &B) const {
2944   Register Res = MI.getOperand(0).getReg();
2945   Register LHS = MI.getOperand(1).getReg();
2946   Register RHS = MI.getOperand(2).getReg();
2947   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2948   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2949 
2950   uint16_t Flags = MI.getFlags();
2951 
2952   LLT S32 = LLT::scalar(32);
2953   LLT S1 = LLT::scalar(1);
2954 
2955   auto One = B.buildFConstant(S32, 1.0f);
2956 
2957   auto DenominatorScaled =
2958     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2959       .addUse(LHS)
2960       .addUse(RHS)
2961       .addImm(0)
2962       .setMIFlags(Flags);
2963   auto NumeratorScaled =
2964     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2965       .addUse(LHS)
2966       .addUse(RHS)
2967       .addImm(1)
2968       .setMIFlags(Flags);
2969 
2970   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2971     .addUse(DenominatorScaled.getReg(0))
2972     .setMIFlags(Flags);
2973   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2974 
2975   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2976   // aren't modeled as reading it.
2977   if (!Mode.allFP32Denormals())
2978     toggleSPDenormMode(true, B, ST, Mode);
2979 
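       // One Newton-Raphson step (Fma0/Fma1) refines the reciprocal of the scaled
       // denominator; the scaled numerator times that reciprocal gives a quotient
       // estimate, which is refined once more (Fma2/Fma3). The final residual
       // (Fma4) feeds div_fmas, and div_fixup handles signs and special values.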
2980   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2981   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2982   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2983   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2984   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2985   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2986 
2987   if (!Mode.allFP32Denormals())
2988     toggleSPDenormMode(false, B, ST, Mode);
2989 
2990   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2991     .addUse(Fma4.getReg(0))
2992     .addUse(Fma1.getReg(0))
2993     .addUse(Fma3.getReg(0))
2994     .addUse(NumeratorScaled.getReg(1))
2995     .setMIFlags(Flags);
2996 
2997   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2998     .addUse(Fmas.getReg(0))
2999     .addUse(RHS)
3000     .addUse(LHS)
3001     .setMIFlags(Flags);
3002 
3003   MI.eraseFromParent();
3004   return true;
3005 }
3006 
3007 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3008                                          MachineRegisterInfo &MRI,
3009                                          MachineIRBuilder &B) const {
3010   Register Res = MI.getOperand(0).getReg();
3011   Register LHS = MI.getOperand(1).getReg();
3012   Register RHS = MI.getOperand(2).getReg();
3013 
3014   uint16_t Flags = MI.getFlags();
3015 
3016   LLT S64 = LLT::scalar(64);
3017   LLT S1 = LLT::scalar(1);
3018 
3019   auto One = B.buildFConstant(S64, 1.0);
3020 
3021   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3022     .addUse(LHS)
3023     .addUse(RHS)
3024     .addImm(0)
3025     .setMIFlags(Flags);
3026 
3027   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3028 
3029   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3030     .addUse(DivScale0.getReg(0))
3031     .setMIFlags(Flags);
3032 
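       // Two Newton-Raphson iterations (Fma0..Fma3) refine the reciprocal of the
       // scaled denominator; the scaled numerator times that reciprocal gives the
       // quotient estimate (Mul), and the remaining residual (Fma4) is corrected
       // by div_fmas/div_fixup.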
3033   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3034   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3035   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3036 
3037   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3038     .addUse(LHS)
3039     .addUse(RHS)
3040     .addImm(1)
3041     .setMIFlags(Flags);
3042 
3043   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3044   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3045   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3046 
3047   Register Scale;
3048   if (!ST.hasUsableDivScaleConditionOutput()) {
3049     // Work around a hardware bug on SI where the condition output from div_scale
3050     // is not usable.
3051 
3052     LLT S32 = LLT::scalar(32);
3053 
3054     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3055     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3056     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3057     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3058 
3059     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3060                               Scale1Unmerge.getReg(1));
3061     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3062                               Scale0Unmerge.getReg(1));
3063     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3064   } else {
3065     Scale = DivScale1.getReg(1);
3066   }
3067 
3068   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3069     .addUse(Fma4.getReg(0))
3070     .addUse(Fma3.getReg(0))
3071     .addUse(Mul.getReg(0))
3072     .addUse(Scale)
3073     .setMIFlags(Flags);
3074 
3075   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3076     .addUse(Fmas.getReg(0))
3077     .addUse(RHS)
3078     .addUse(LHS)
3079     .setMIFlags(Flags);
3080 
3081   MI.eraseFromParent();
3082   return true;
3083 }
3084 
3085 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3086                                                  MachineRegisterInfo &MRI,
3087                                                  MachineIRBuilder &B) const {
3088   Register Res = MI.getOperand(0).getReg();
3089   Register LHS = MI.getOperand(2).getReg();
3090   Register RHS = MI.getOperand(3).getReg();
3091   uint16_t Flags = MI.getFlags();
3092 
3093   LLT S32 = LLT::scalar(32);
3094   LLT S1 = LLT::scalar(1);
3095 
3096   auto Abs = B.buildFAbs(S32, RHS, Flags);
3097   const APFloat C0Val(1.0f);
3098 
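       // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: if |RHS| exceeds 2^96, scale
       // the denominator by 2^-32 before taking the reciprocal so the intermediate
       // result stays well inside the normal range, then apply the same scale
       // factor to the quotient.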
3099   auto C0 = B.buildConstant(S32, 0x6f800000);
3100   auto C1 = B.buildConstant(S32, 0x2f800000);
3101   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3102 
3103   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3104   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3105 
3106   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3107 
3108   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3109     .addUse(Mul0.getReg(0))
3110     .setMIFlags(Flags);
3111 
3112   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3113 
3114   B.buildFMul(Res, Sel, Mul1, Flags);
3115 
3116   MI.eraseFromParent();
3117   return true;
3118 }
3119 
3120 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3121                                             MachineRegisterInfo &MRI,
3122                                             MachineIRBuilder &B) const {
3123   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3124   uint64_t Offset =
3125     ST.getTargetLowering()->getImplicitParameterOffset(
3126       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
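       // Implicit arguments are laid out immediately after the explicit kernel
       // arguments, so the pointer is the kernarg segment pointer plus this offset.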
3127   LLT DstTy = MRI.getType(DstReg);
3128   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3129 
3130   const ArgDescriptor *Arg;
3131   const TargetRegisterClass *RC;
3132   LLT ArgTy;
3133   std::tie(Arg, RC, ArgTy) =
3134       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3135   if (!Arg)
3136     return false;
3137 
3138   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3139   if (!loadInputValue(KernargPtrReg, B, Arg))
3140     return false;
3141 
3142   // FIXME: This should be nuw
3143   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3144   return true;
3145 }
3146 
3147 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3148                                                  MachineRegisterInfo &MRI,
3149                                                  MachineIRBuilder &B) const {
3150   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3151   if (!MFI->isEntryFunction()) {
3152     return legalizePreloadedArgIntrin(MI, MRI, B,
3153                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3154   }
3155 
3156   Register DstReg = MI.getOperand(0).getReg();
3157   if (!getImplicitArgPtr(DstReg, MRI, B))
3158     return false;
3159 
3160   MI.eraseFromParent();
3161   return true;
3162 }
3163 
3164 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3165                                               MachineRegisterInfo &MRI,
3166                                               MachineIRBuilder &B,
3167                                               unsigned AddrSpace) const {
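       // A flat pointer is in the queried segment iff the high 32 bits of the
       // address match that segment's aperture, so compare the high dword of the
       // pointer against the aperture register.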
3168   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3169   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3170   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3171   MI.eraseFromParent();
3172   return true;
3173 }
3174 
3175 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3176 // offset (the offset that is included in bounds checking and swizzling, to be
3177 // split between the instruction's voffset and immoffset fields) and soffset
3178 // (the offset that is excluded from bounds checking and swizzling, to go in
3179 // the instruction's soffset field).  This function takes the first kind of
3180 // offset and figures out how to split it between voffset and immoffset.
3181 std::tuple<Register, unsigned, unsigned>
3182 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3183                                         Register OrigOffset) const {
3184   const unsigned MaxImm = 4095;
3185   Register BaseReg;
3186   unsigned TotalConstOffset;
3187   MachineInstr *OffsetDef;
3188   const LLT S32 = LLT::scalar(32);
3189 
3190   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3191     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3192 
3193   unsigned ImmOffset = TotalConstOffset;
3194 
3195   // If the immediate value is too big for the immoffset field, keep only the
3196   // low 12 bits in the immoffset field so that the value that is copied/added
3197   // for the voffset field is a multiple of 4096, and it stands more chance
3198   // of being CSEd with the copy/add for another similar load/store.
3199   // However, do not do that rounding down to a multiple of 4096 if that is a
3200   // negative number, as it appears to be illegal to have a negative offset
3201   // in the vgpr, even if adding the immediate offset makes it positive.
3202   unsigned Overflow = ImmOffset & ~MaxImm;
3203   ImmOffset -= Overflow;
3204   if ((int32_t)Overflow < 0) {
3205     Overflow += ImmOffset;
3206     ImmOffset = 0;
3207   }
3208 
3209   if (Overflow != 0) {
3210     if (!BaseReg) {
3211       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3212     } else {
3213       auto OverflowVal = B.buildConstant(S32, Overflow);
3214       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3215     }
3216   }
3217 
3218   if (!BaseReg)
3219     BaseReg = B.buildConstant(S32, 0).getReg(0);
3220 
3221   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3222 }
3223 
3224 /// Handle register layout difference for f16 images for some subtargets.
3225 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3226                                              MachineRegisterInfo &MRI,
3227                                              Register Reg) const {
3228   if (!ST.hasUnpackedD16VMem())
3229     return Reg;
3230 
3231   const LLT S16 = LLT::scalar(16);
3232   const LLT S32 = LLT::scalar(32);
3233   LLT StoreVT = MRI.getType(Reg);
3234   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3235 
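       // With the unpacked layout each 16-bit element occupies the low half of its
       // own 32-bit register, so split the vector and any-extend every element.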
3236   auto Unmerge = B.buildUnmerge(S16, Reg);
3237 
3238   SmallVector<Register, 4> WideRegs;
3239   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3240     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3241 
3242   int NumElts = StoreVT.getNumElements();
3243 
3244   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3245 }
3246 
3247 Register AMDGPULegalizerInfo::fixStoreSourceType(
3248   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3249   MachineRegisterInfo *MRI = B.getMRI();
3250   LLT Ty = MRI->getType(VData);
3251 
3252   const LLT S16 = LLT::scalar(16);
3253 
3254   // Fixup illegal register types for i8 and i16 stores.
3255   if (Ty == LLT::scalar(8) || Ty == S16) {
3256     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3257     return AnyExt;
3258   }
3259 
3260   if (Ty.isVector()) {
3261     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3262       if (IsFormat)
3263         return handleD16VData(B, *MRI, VData);
3264     }
3265   }
3266 
3267   return VData;
3268 }
3269 
3270 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3271                                               MachineRegisterInfo &MRI,
3272                                               MachineIRBuilder &B,
3273                                               bool IsTyped,
3274                                               bool IsFormat) const {
3275   Register VData = MI.getOperand(1).getReg();
3276   LLT Ty = MRI.getType(VData);
3277   LLT EltTy = Ty.getScalarType();
3278   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3279   const LLT S32 = LLT::scalar(32);
3280 
3281   VData = fixStoreSourceType(B, VData, IsFormat);
3282   Register RSrc = MI.getOperand(2).getReg();
3283 
3284   MachineMemOperand *MMO = *MI.memoperands_begin();
3285   const int MemSize = MMO->getSize();
3286 
3287   unsigned ImmOffset;
3288   unsigned TotalOffset;
3289 
3290   // The typed intrinsics add an immediate after the registers.
3291   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3292 
3293   // The struct intrinsic variants add one additional operand over raw.
3294   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3295   Register VIndex;
3296   int OpOffset = 0;
3297   if (HasVIndex) {
3298     VIndex = MI.getOperand(3).getReg();
3299     OpOffset = 1;
3300   }
3301 
3302   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3303   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3304 
3305   unsigned Format = 0;
3306   if (IsTyped) {
3307     Format = MI.getOperand(5 + OpOffset).getImm();
3308     ++OpOffset;
3309   }
3310 
3311   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3312 
3313   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3314   if (TotalOffset != 0)
3315     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3316 
3317   unsigned Opc;
3318   if (IsTyped) {
3319     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3320                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3321   } else if (IsFormat) {
3322     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3323                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3324   } else {
3325     switch (MemSize) {
3326     case 1:
3327       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3328       break;
3329     case 2:
3330       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3331       break;
3332     default:
3333       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3334       break;
3335     }
3336   }
3337 
3338   if (!VIndex)
3339     VIndex = B.buildConstant(S32, 0).getReg(0);
3340 
3341   auto MIB = B.buildInstr(Opc)
3342     .addUse(VData)              // vdata
3343     .addUse(RSrc)               // rsrc
3344     .addUse(VIndex)             // vindex
3345     .addUse(VOffset)            // voffset
3346     .addUse(SOffset)            // soffset
3347     .addImm(ImmOffset);         // offset(imm)
3348 
3349   if (IsTyped)
3350     MIB.addImm(Format);
3351 
3352   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3353      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3354      .addMemOperand(MMO);
3355 
3356   MI.eraseFromParent();
3357   return true;
3358 }
3359 
3360 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3361                                              MachineRegisterInfo &MRI,
3362                                              MachineIRBuilder &B,
3363                                              bool IsFormat,
3364                                              bool IsTyped) const {
3365   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3366   MachineMemOperand *MMO = *MI.memoperands_begin();
3367   const int MemSize = MMO->getSize();
3368   const LLT S32 = LLT::scalar(32);
3369 
3370   Register Dst = MI.getOperand(0).getReg();
3371   Register RSrc = MI.getOperand(2).getReg();
3372 
3373   // The typed intrinsics add an immediate after the registers.
3374   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3375 
3376   // The struct intrinsic variants add one additional operand over raw.
3377   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3378   Register VIndex;
3379   int OpOffset = 0;
3380   if (HasVIndex) {
3381     VIndex = MI.getOperand(3).getReg();
3382     OpOffset = 1;
3383   }
3384 
3385   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3386   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3387 
3388   unsigned Format = 0;
3389   if (IsTyped) {
3390     Format = MI.getOperand(5 + OpOffset).getImm();
3391     ++OpOffset;
3392   }
3393 
3394   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3395   unsigned ImmOffset;
3396   unsigned TotalOffset;
3397 
3398   LLT Ty = MRI.getType(Dst);
3399   LLT EltTy = Ty.getScalarType();
3400   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3401   const bool Unpacked = ST.hasUnpackedD16VMem();
3402 
3403   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3404   if (TotalOffset != 0)
3405     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3406 
3407   unsigned Opc;
3408 
3409   if (IsTyped) {
3410     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3411                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3412   } else if (IsFormat) {
3413     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3414                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3415   } else {
3416     switch (MemSize) {
3417     case 1:
3418       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3419       break;
3420     case 2:
3421       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3422       break;
3423     default:
3424       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3425       break;
3426     }
3427   }
3428 
3429   Register LoadDstReg;
3430 
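       // Sub-dword loads and scalar d16 loads produce a 32-bit result that is
       // truncated afterwards; packed d16 vectors on unpacked subtargets are
       // loaded as 32-bit elements and repacked below.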
3431   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3432   LLT UnpackedTy = Ty.changeElementSize(32);
3433 
3434   if (IsExtLoad)
3435     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3436   else if (Unpacked && IsD16 && Ty.isVector())
3437     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3438   else
3439     LoadDstReg = Dst;
3440 
3441   if (!VIndex)
3442     VIndex = B.buildConstant(S32, 0).getReg(0);
3443 
3444   auto MIB = B.buildInstr(Opc)
3445     .addDef(LoadDstReg)         // vdata
3446     .addUse(RSrc)               // rsrc
3447     .addUse(VIndex)             // vindex
3448     .addUse(VOffset)            // voffset
3449     .addUse(SOffset)            // soffset
3450     .addImm(ImmOffset);         // offset(imm)
3451 
3452   if (IsTyped)
3453     MIB.addImm(Format);
3454 
3455   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3456      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3457      .addMemOperand(MMO);
3458 
3459   if (LoadDstReg != Dst) {
3460     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3461 
3462     // The result of an extending load was widened to 32 bits; truncate it back.
3463     if (IsExtLoad)
3464       B.buildTrunc(Dst, LoadDstReg);
3465     else {
3466       // Repack to original 16-bit vector result
3467       // FIXME: G_TRUNC should work, but legalization currently fails
3468       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3469       SmallVector<Register, 4> Repack;
3470       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3471         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3472       B.buildMerge(Dst, Repack);
3473     }
3474   }
3475 
3476   MI.eraseFromParent();
3477   return true;
3478 }
3479 
3480 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3481                                                MachineIRBuilder &B,
3482                                                bool IsInc) const {
3483   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3484                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3485   B.buildInstr(Opc)
3486     .addDef(MI.getOperand(0).getReg())
3487     .addUse(MI.getOperand(2).getReg())
3488     .addUse(MI.getOperand(3).getReg())
3489     .cloneMemRefs(MI);
3490   MI.eraseFromParent();
3491   return true;
3492 }
3493 
3494 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3495   switch (IntrID) {
3496   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3497   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3498     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3499   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3500   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3501     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3502   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3503   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3504     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3505   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3506   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3507     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3508   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3509   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3510     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3511   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3512   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3513     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3514   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3515   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3516     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3517   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3518   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3519     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3520   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3521   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3522     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3523   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3524   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3525     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3526   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3527   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3528     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3529   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3530   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3531     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3532   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3533   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3534     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3535   default:
3536     llvm_unreachable("unhandled atomic opcode");
3537   }
3538 }
3539 
3540 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3541                                                MachineIRBuilder &B,
3542                                                Intrinsic::ID IID) const {
3543   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3544                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3545 
3546   Register Dst = MI.getOperand(0).getReg();
3547   Register VData = MI.getOperand(2).getReg();
3548 
3549   Register CmpVal;
3550   int OpOffset = 0;
3551 
3552   if (IsCmpSwap) {
3553     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3554     ++OpOffset;
3555   }
3556 
3557   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3558   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3559 
3560   // The struct intrinsic variants add one additional operand over raw.
3561   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3562   Register VIndex;
3563   if (HasVIndex) {
3564     VIndex = MI.getOperand(4 + OpOffset).getReg();
3565     ++OpOffset;
3566   }
3567 
3568   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3569   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3570   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3571 
3572   MachineMemOperand *MMO = *MI.memoperands_begin();
3573 
3574   unsigned ImmOffset;
3575   unsigned TotalOffset;
3576   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3577   if (TotalOffset != 0)
3578     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3579 
3580   if (!VIndex)
3581     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3582 
3583   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3584     .addDef(Dst)
3585     .addUse(VData); // vdata
3586 
3587   if (IsCmpSwap)
3588     MIB.addReg(CmpVal);
3589 
3590   MIB.addUse(RSrc)               // rsrc
3591      .addUse(VIndex)             // vindex
3592      .addUse(VOffset)            // voffset
3593      .addUse(SOffset)            // soffset
3594      .addImm(ImmOffset)          // offset(imm)
3595      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3596      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3597      .addMemOperand(MMO);
3598 
3599   MI.eraseFromParent();
3600   return true;
3601 }
3602 
3603 /// Pack the 16-bit image addressing operands of \p MI into dword sized v2s16
3604 /// registers, appending the results to \p PackedAddrs.
3605 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3606                                         SmallVectorImpl<Register> &PackedAddrs,
3607                                         int AddrIdx, int DimIdx, int EndIdx,
3608                                         int NumGradients) {
3609   const LLT S16 = LLT::scalar(16);
3610   const LLT V2S16 = LLT::vector(2, 16);
3611 
3612   for (int I = AddrIdx; I < EndIdx; ++I) {
3613     MachineOperand &SrcOp = MI.getOperand(I);
3614     if (!SrcOp.isReg())
3615       continue; // _L to _LZ may have eliminated this.
3616 
3617     Register AddrReg = SrcOp.getReg();
3618 
3619     if (I < DimIdx) {
3620       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3621       PackedAddrs.push_back(AddrReg);
3622     } else {
3623       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3624       // derivatives dx/dh and dx/dv are packed with undef.
3625       if (((I + 1) >= EndIdx) ||
3626           ((NumGradients / 2) % 2 == 1 &&
3627            (I == DimIdx + (NumGradients / 2) - 1 ||
3628             I == DimIdx + NumGradients - 1)) ||
3629           // Check for _L to _LZ optimization
3630           !MI.getOperand(I + 1).isReg()) {
3631         PackedAddrs.push_back(
3632             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3633                 .getReg(0));
3634       } else {
3635         PackedAddrs.push_back(
3636             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3637                 .getReg(0));
3638         ++I;
3639       }
3640     }
3641   }
3642 }
3643 
3644 /// Convert from separate vaddr components to a single vector address register,
3645 /// and replace the remaining operands with $noreg.
3646 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3647                                      int DimIdx, int NumVAddrs) {
3648   const LLT S32 = LLT::scalar(32);
3649 
3650   SmallVector<Register, 8> AddrRegs;
3651   for (int I = 0; I != NumVAddrs; ++I) {
3652     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3653     if (SrcOp.isReg()) {
3654       AddrRegs.push_back(SrcOp.getReg());
3655       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3656     }
3657   }
3658 
3659   int NumAddrRegs = AddrRegs.size();
3660   if (NumAddrRegs != 1) {
3661     // Round up to 8 elements for v5-v7
3662     // FIXME: Missing intermediate sized register classes and instructions.
3663     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3664       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3665       auto Undef = B.buildUndef(S32);
3666       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3667       NumAddrRegs = RoundedNumRegs;
3668     }
3669 
3670     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3671     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3672   }
3673 
3674   for (int I = 1; I != NumVAddrs; ++I) {
3675     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3676     if (SrcOp.isReg())
3677       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3678   }
3679 }
3680 
3681 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3682 ///
3683 /// Depending on the subtarget, load/store with 16-bit element data need to be
3684 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3685 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3686 /// registers.
3687 ///
3688 /// We don't want to directly select image instructions just yet, but also want
3689 /// to expose all register repacking to the legalizer/combiners. We also don't
3690 /// want a selected instruction entering RegBankSelect. In order to avoid
3691 /// defining a multitude of intermediate image instructions, directly hack on
3692 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3693 /// the now-unnecessary arguments with $noreg.
3694 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3695     MachineInstr &MI, MachineIRBuilder &B,
3696     GISelChangeObserver &Observer,
3697     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3698 
3699   const int NumDefs = MI.getNumExplicitDefs();
3700   bool IsTFE = NumDefs == 2;
3701   // We are only processing the operands of d16 image operations on subtargets
3702   // that use the unpacked register layout, or need to repack the TFE result.
3703 
3704   // TODO: Do we need to guard against already legalized intrinsics?
3705   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3706     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3707 
3708   MachineRegisterInfo *MRI = B.getMRI();
3709   const LLT S32 = LLT::scalar(32);
3710   const LLT S16 = LLT::scalar(16);
3711   const LLT V2S16 = LLT::vector(2, 16);
3712 
3713   // Index of first address argument
3714   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3715 
3716   int NumVAddrs, NumGradients;
3717   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3718   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3719     getDMaskIdx(BaseOpcode, NumDefs);
3720   unsigned DMask = 0;
3721 
3722   // Check for 16 bit addresses and pack if true.
3723   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3724   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3725   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3726   const bool IsG16 = GradTy == S16;
3727   const bool IsA16 = AddrTy == S16;
3728 
3729   int DMaskLanes = 0;
3730   if (!BaseOpcode->Atomic) {
3731     DMask = MI.getOperand(DMaskIdx).getImm();
3732     if (BaseOpcode->Gather4) {
3733       DMaskLanes = 4;
3734     } else if (DMask != 0) {
3735       DMaskLanes = countPopulation(DMask);
3736     } else if (!IsTFE && !BaseOpcode->Store) {
3737       // If dmask is 0, this is a no-op load. This can be eliminated.
3738       B.buildUndef(MI.getOperand(0));
3739       MI.eraseFromParent();
3740       return true;
3741     }
3742   }
3743 
3744   Observer.changingInstr(MI);
3745   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3746 
3747   unsigned NewOpcode = NumDefs == 0 ?
3748     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3749 
3750   // Track that we legalized this
3751   MI.setDesc(B.getTII().get(NewOpcode));
3752 
3753   // Expecting to get an error flag since TFE is on and dmask is 0. Force
3754   // dmask to be at least 1, otherwise the instruction will fail.
3755   if (IsTFE && DMask == 0) {
3756     DMask = 0x1;
3757     DMaskLanes = 1;
3758     MI.getOperand(DMaskIdx).setImm(DMask);
3759   }
3760 
3761   if (BaseOpcode->Atomic) {
3762     Register VData0 = MI.getOperand(2).getReg();
3763     LLT Ty = MRI->getType(VData0);
3764 
3765     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3766     if (Ty.isVector())
3767       return false;
3768 
3769     if (BaseOpcode->AtomicX2) {
3770       Register VData1 = MI.getOperand(3).getReg();
3771       // The two values are packed in one register.
3772       LLT PackedTy = LLT::vector(2, Ty);
3773       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3774       MI.getOperand(2).setReg(Concat.getReg(0));
3775       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3776     }
3777   }
3778 
3779   int CorrectedNumVAddrs = NumVAddrs;
3780 
3781   // Optimize _L to _LZ when _L is zero
3782   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3783         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3784     const ConstantFP *ConstantLod;
3785     const int LodIdx = AddrIdx + NumVAddrs - 1;
3786 
3787     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3788       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3789         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3790         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3791           LZMappingInfo->LZ, ImageDimIntr->Dim);
3792 
3793         // The starting indexes should remain in the same place.
3794         --NumVAddrs;
3795         --CorrectedNumVAddrs;
3796 
3797         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3798           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3799         MI.RemoveOperand(LodIdx);
3800       }
3801     }
3802   }
3803 
3804   // Optimize _mip away, when 'lod' is zero
3805   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3806     int64_t ConstantLod;
3807     const int LodIdx = AddrIdx + NumVAddrs - 1;
3808 
3809     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3810       if (ConstantLod == 0) {
3811         // TODO: Change the intrinsic opcode and remove the operand instead of
3812         // replacing it with 0, as is done for the _L to _LZ handling above.
3813         MI.getOperand(LodIdx).ChangeToImmediate(0);
3814         --CorrectedNumVAddrs;
3815       }
3816     }
3817   }
3818 
3819   // Rewrite the addressing register layout before doing anything else.
3820   if (IsA16 || IsG16) {
3821     if (IsA16) {
3822       // Target must support the feature and gradients need to be 16 bit too
3823       if (!ST.hasA16() || !IsG16)
3824         return false;
3825     } else if (!ST.hasG16())
3826       return false;
3827 
3828     if (NumVAddrs > 1) {
3829       SmallVector<Register, 4> PackedRegs;
3830       // Don't compress addresses for G16
3831       const int PackEndIdx =
3832           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3833       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3834                                   PackEndIdx, NumGradients);
3835 
3836       if (!IsA16) {
3837         // Add uncompressed address
3838         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3839           Register AddrReg = MI.getOperand(I).getReg();
3840           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3841           PackedRegs.push_back(AddrReg);
3842         }
3843       }
3844 
3845       // See also below in the non-a16 branch
3846       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3847 
3848       if (!UseNSA && PackedRegs.size() > 1) {
3849         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3850         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3851         PackedRegs[0] = Concat.getReg(0);
3852         PackedRegs.resize(1);
3853       }
3854 
3855       const int NumPacked = PackedRegs.size();
3856       for (int I = 0; I != NumVAddrs; ++I) {
3857         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3858         if (!SrcOp.isReg()) {
3859           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3860           continue;
3861         }
3862 
3863         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3864 
3865         if (I < NumPacked)
3866           SrcOp.setReg(PackedRegs[I]);
3867         else
3868           SrcOp.setReg(AMDGPU::NoRegister);
3869       }
3870     }
3871   } else {
3872     // If the register allocator cannot place the address registers contiguously
3873     // without introducing moves, then using the non-sequential address encoding
3874     // is always preferable, since it saves VALU instructions and is usually a
3875     // wash in terms of code size or even better.
3876     //
3877     // However, we currently have no way of hinting to the register allocator
3878     // that MIMG addresses should be placed contiguously when it is possible to
3879     // do so, so force non-NSA for the common 2-address case as a heuristic.
3880     //
3881     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3882     // allocation when possible.
3883     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3884 
3885     if (!UseNSA && NumVAddrs > 1)
3886       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3887   }
3888 
3889   int Flags = 0;
3890   if (IsA16)
3891     Flags |= 1;
3892   if (IsG16)
3893     Flags |= 2;
3894   MI.addOperand(MachineOperand::CreateImm(Flags));
3895 
3896   if (BaseOpcode->Store) { // No TFE for stores?
3897     // TODO: Handle dmask trim
3898     Register VData = MI.getOperand(1).getReg();
3899     LLT Ty = MRI->getType(VData);
3900     if (!Ty.isVector() || Ty.getElementType() != S16)
3901       return true;
3902 
3903     Register RepackedReg = handleD16VData(B, *MRI, VData);
3904     if (RepackedReg != VData) {
3905       MI.getOperand(1).setReg(RepackedReg);
3906     }
3907 
3908     return true;
3909   }
3910 
3911   Register DstReg = MI.getOperand(0).getReg();
3912   LLT Ty = MRI->getType(DstReg);
3913   const LLT EltTy = Ty.getScalarType();
3914   const bool IsD16 = Ty.getScalarType() == S16;
3915   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3916 
3917   // Confirm that the return type is large enough for the dmask specified
3918   if (NumElts < DMaskLanes)
3919     return false;
3920 
3921   if (NumElts > 4 || DMaskLanes > 4)
3922     return false;
3923 
3924   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3925   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3926 
3927   // The raw dword aligned data component of the load. The only legal cases
3928   // where this matters should be when using the packed D16 format, for
3929   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3930   LLT RoundedTy;
3931 
3932   // S32 vector to cover all data, plus the TFE result element.
3933   LLT TFETy;
3934 
3935   // Register type to use for each loaded component. Will be S32 or V2S16.
3936   LLT RegTy;
3937 
3938   if (IsD16 && ST.hasUnpackedD16VMem()) {
3939     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3940     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3941     RegTy = S32;
3942   } else {
3943     unsigned EltSize = EltTy.getSizeInBits();
3944     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3945     unsigned RoundedSize = 32 * RoundedElts;
3946     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3947     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3948     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3949   }
3950 
3951   // The return type does not need adjustment.
3952   // TODO: Should we change s16 case to s32 or <2 x s16>?
3953   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3954     return true;
3955 
3956   Register Dst1Reg;
3957 
3958   // Insert after the instruction.
3959   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3960 
3961   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3962   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3963   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3964   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3965 
3966   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3967 
3968   MI.getOperand(0).setReg(NewResultReg);
3969 
3970   // In the IR, TFE is supposed to be used with a 2 element struct return
3971   // type. The instruction really returns these two values in one contiguous
3972   // register, with one additional dword beyond the loaded data. Rewrite the
3973   // return type to use a single register result.
3974 
3975   if (IsTFE) {
3976     Dst1Reg = MI.getOperand(1).getReg();
3977     if (MRI->getType(Dst1Reg) != S32)
3978       return false;
3979 
3980     // TODO: Make sure the TFE operand bit is set.
3981     MI.RemoveOperand(1);
3982 
3983     // Handle the easy case that requires no repack instructions.
3984     if (Ty == S32) {
3985       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3986       return true;
3987     }
3988   }
3989 
3990   // Now figure out how to copy the new result register back into the old
3991   // result.
3992   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3993 
3994   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
3995 
3996   if (ResultNumRegs == 1) {
3997     assert(!IsTFE);
3998     ResultRegs[0] = NewResultReg;
3999   } else {
4000     // We have to repack into a new vector of some kind.
4001     for (int I = 0; I != NumDataRegs; ++I)
4002       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4003     B.buildUnmerge(ResultRegs, NewResultReg);
4004 
4005     // Drop the final TFE element to get the data part. The TFE result is
4006     // directly written to the right place already.
4007     if (IsTFE)
4008       ResultRegs.resize(NumDataRegs);
4009   }
4010 
4011   // For an s16 scalar result, we form an s32 result with a truncate regardless
4012   // of packed vs. unpacked.
4013   if (IsD16 && !Ty.isVector()) {
4014     B.buildTrunc(DstReg, ResultRegs[0]);
4015     return true;
4016   }
4017 
4018   // Avoid a build/concat_vector of 1 entry.
4019   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4020     B.buildBitcast(DstReg, ResultRegs[0]);
4021     return true;
4022   }
4023 
4024   assert(Ty.isVector());
4025 
4026   if (IsD16) {
4027     // For packed D16 results with TFE enabled, all the data components are
4028     // S32. Cast back to the expected type.
4029     //
4030     // TODO: We don't really need to load s32 elements. We would only need one
4031     // cast for the TFE result if a multiple of v2s16 was used.
4032     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4033       for (Register &Reg : ResultRegs)
4034         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4035     } else if (ST.hasUnpackedD16VMem()) {
4036       for (Register &Reg : ResultRegs)
4037         Reg = B.buildTrunc(S16, Reg).getReg(0);
4038     }
4039   }
4040 
4041   auto padWithUndef = [&](LLT Ty, int NumElts) {
4042     if (NumElts == 0)
4043       return;
4044     Register Undef = B.buildUndef(Ty).getReg(0);
4045     for (int I = 0; I != NumElts; ++I)
4046       ResultRegs.push_back(Undef);
4047   };
4048 
4049   // Pad out any elements eliminated due to the dmask.
4050   LLT ResTy = MRI->getType(ResultRegs[0]);
4051   if (!ResTy.isVector()) {
4052     padWithUndef(ResTy, NumElts - ResultRegs.size());
4053     B.buildBuildVector(DstReg, ResultRegs);
4054     return true;
4055   }
4056 
4057   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4058   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4059 
4060   // Deal with the one annoying legal case.
4061   const LLT V3S16 = LLT::vector(3, 16);
4062   if (Ty == V3S16) {
4063     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4064     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4065     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4066     return true;
4067   }
4068 
4069   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4070   B.buildConcatVectors(DstReg, ResultRegs);
4071   return true;
4072 }
4073 
4074 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4075   MachineInstr &MI, MachineIRBuilder &B,
4076   GISelChangeObserver &Observer) const {
4077   Register Dst = MI.getOperand(0).getReg();
4078   LLT Ty = B.getMRI()->getType(Dst);
4079   unsigned Size = Ty.getSizeInBits();
4080   MachineFunction &MF = B.getMF();
4081 
4082   Observer.changingInstr(MI);
4083 
4084   // FIXME: We don't really need this intermediate instruction. The intrinsic
4085   // should be fixed to have a memory operand. Since it's readnone, we're not
4086   // allowed to add one.
4087   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4088   MI.RemoveOperand(1); // Remove intrinsic ID
4089 
4090   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4091   // TODO: Should this use datalayout alignment?
4092   const unsigned MemSize = (Size + 7) / 8;
4093   const Align MemAlign(4);
4094   MachineMemOperand *MMO = MF.getMachineMemOperand(
4095       MachinePointerInfo(),
4096       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4097           MachineMemOperand::MOInvariant,
4098       MemSize, MemAlign);
4099   MI.addMemOperand(MF, MMO);
4100 
4101   // There are no 96-bit result scalar loads, but widening to 128-bit should
4102   // always be legal. We may need to restore this to a 96-bit result if it turns
4103   // out this needs to be converted to a vector load during RegBankSelect.
4104   if (!isPowerOf2_32(Size)) {
4105     LegalizerHelper Helper(MF, *this, Observer, B);
4106 
4107     if (Ty.isVector())
4108       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4109     else
4110       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4111   }
4112 
4113   Observer.changedInstr(MI);
4114   return true;
4115 }
4116 
4117 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4118                                                 MachineRegisterInfo &MRI,
4119                                                 MachineIRBuilder &B) const {
4120   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4121   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4122       !ST.isTrapHandlerEnabled()) {
4123     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4124   } else {
4125     // Pass queue pointer to trap handler as input, and insert trap instruction
4126     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4127     const ArgDescriptor *Arg =
4128         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4129     if (!Arg)
4130       return false;
4131     MachineRegisterInfo &MRI = *B.getMRI();
4132     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4133     Register LiveIn = getLiveInRegister(
4134         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4135         /*InsertLiveInCopy=*/false);
4136     if (!loadInputValue(LiveIn, B, Arg))
4137       return false;
4138     B.buildCopy(SGPR01, LiveIn);
4139     B.buildInstr(AMDGPU::S_TRAP)
4140         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4141         .addReg(SGPR01, RegState::Implicit);
4142   }
4143 
4144   MI.eraseFromParent();
4145   return true;
4146 }
4147 
4148 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4149     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4150   // If this is a non-HSA path or the trap handler is disabled, report a
4151   // warning accordingly.
4152   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4153       !ST.isTrapHandlerEnabled()) {
4154     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4155                                      "debugtrap handler not supported",
4156                                      MI.getDebugLoc(), DS_Warning);
4157     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4158     Ctx.diagnose(NoTrap);
4159   } else {
4160     // Insert debug-trap instruction
4161     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4162   }
4163 
4164   MI.eraseFromParent();
4165   return true;
4166 }
4167 
4168 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4169                                             MachineInstr &MI) const {
4170   MachineIRBuilder &B = Helper.MIRBuilder;
4171   MachineRegisterInfo &MRI = *B.getMRI();
4172 
4173   // Replace the G_BRCOND uses with the exec-manipulating branch pseudos.
4174   auto IntrID = MI.getIntrinsicID();
4175   switch (IntrID) {
4176   case Intrinsic::amdgcn_if:
4177   case Intrinsic::amdgcn_else: {
4178     MachineInstr *Br = nullptr;
4179     MachineBasicBlock *UncondBrTarget = nullptr;
4180     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4181       const SIRegisterInfo *TRI
4182         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4183 
4184       Register Def = MI.getOperand(1).getReg();
4185       Register Use = MI.getOperand(3).getReg();
4186 
4187       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4188       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4189       if (IntrID == Intrinsic::amdgcn_if) {
4190         B.buildInstr(AMDGPU::SI_IF)
4191           .addDef(Def)
4192           .addUse(Use)
4193           .addMBB(UncondBrTarget);
4194       } else {
4195         B.buildInstr(AMDGPU::SI_ELSE)
4196           .addDef(Def)
4197           .addUse(Use)
4198           .addMBB(UncondBrTarget)
4199           .addImm(0);
4200       }
4201 
4202       if (Br) {
4203         Br->getOperand(0).setMBB(CondBrTarget);
4204       } else {
4205         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4206         // since we're swapping branch targets it needs to be reinserted.
4207         // FIXME: IRTranslator should probably not do this
4208         B.buildBr(*CondBrTarget);
4209       }
4210 
4211       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4212       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4213       MI.eraseFromParent();
4214       BrCond->eraseFromParent();
4215       return true;
4216     }
4217 
4218     return false;
4219   }
4220   case Intrinsic::amdgcn_loop: {
4221     MachineInstr *Br = nullptr;
4222     MachineBasicBlock *UncondBrTarget = nullptr;
4223     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4224       const SIRegisterInfo *TRI
4225         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4226 
4227       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4228       Register Reg = MI.getOperand(2).getReg();
4229 
4230       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4231       B.buildInstr(AMDGPU::SI_LOOP)
4232         .addUse(Reg)
4233         .addMBB(UncondBrTarget);
4234 
4235       if (Br)
4236         Br->getOperand(0).setMBB(CondBrTarget);
4237       else
4238         B.buildBr(*CondBrTarget);
4239 
4240       MI.eraseFromParent();
4241       BrCond->eraseFromParent();
4242       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4243       return true;
4244     }
4245 
4246     return false;
4247   }
4248   case Intrinsic::amdgcn_kernarg_segment_ptr:
4249     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4250       // This only makes sense to call in a kernel, so just lower to null.
4251       B.buildConstant(MI.getOperand(0).getReg(), 0);
4252       MI.eraseFromParent();
4253       return true;
4254     }
4255 
4256     return legalizePreloadedArgIntrin(
4257       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4258   case Intrinsic::amdgcn_implicitarg_ptr:
4259     return legalizeImplicitArgPtr(MI, MRI, B);
4260   case Intrinsic::amdgcn_workitem_id_x:
4261     return legalizePreloadedArgIntrin(MI, MRI, B,
4262                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4263   case Intrinsic::amdgcn_workitem_id_y:
4264     return legalizePreloadedArgIntrin(MI, MRI, B,
4265                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4266   case Intrinsic::amdgcn_workitem_id_z:
4267     return legalizePreloadedArgIntrin(MI, MRI, B,
4268                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4269   case Intrinsic::amdgcn_workgroup_id_x:
4270     return legalizePreloadedArgIntrin(MI, MRI, B,
4271                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4272   case Intrinsic::amdgcn_workgroup_id_y:
4273     return legalizePreloadedArgIntrin(MI, MRI, B,
4274                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4275   case Intrinsic::amdgcn_workgroup_id_z:
4276     return legalizePreloadedArgIntrin(MI, MRI, B,
4277                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4278   case Intrinsic::amdgcn_dispatch_ptr:
4279     return legalizePreloadedArgIntrin(MI, MRI, B,
4280                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4281   case Intrinsic::amdgcn_queue_ptr:
4282     return legalizePreloadedArgIntrin(MI, MRI, B,
4283                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4284   case Intrinsic::amdgcn_implicit_buffer_ptr:
4285     return legalizePreloadedArgIntrin(
4286       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4287   case Intrinsic::amdgcn_dispatch_id:
4288     return legalizePreloadedArgIntrin(MI, MRI, B,
4289                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4290   case Intrinsic::amdgcn_fdiv_fast:
4291     return legalizeFDIVFastIntrin(MI, MRI, B);
4292   case Intrinsic::amdgcn_is_shared:
4293     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4294   case Intrinsic::amdgcn_is_private:
4295     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4296   case Intrinsic::amdgcn_wavefrontsize: {
4297     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4298     MI.eraseFromParent();
4299     return true;
4300   }
4301   case Intrinsic::amdgcn_s_buffer_load:
4302     return legalizeSBufferLoad(MI, B, Helper.Observer);
4303   case Intrinsic::amdgcn_raw_buffer_store:
4304   case Intrinsic::amdgcn_struct_buffer_store:
4305     return legalizeBufferStore(MI, MRI, B, false, false);
4306   case Intrinsic::amdgcn_raw_buffer_store_format:
4307   case Intrinsic::amdgcn_struct_buffer_store_format:
4308     return legalizeBufferStore(MI, MRI, B, false, true);
4309   case Intrinsic::amdgcn_raw_tbuffer_store:
4310   case Intrinsic::amdgcn_struct_tbuffer_store:
4311     return legalizeBufferStore(MI, MRI, B, true, true);
4312   case Intrinsic::amdgcn_raw_buffer_load:
4313   case Intrinsic::amdgcn_struct_buffer_load:
4314     return legalizeBufferLoad(MI, MRI, B, false, false);
4315   case Intrinsic::amdgcn_raw_buffer_load_format:
4316   case Intrinsic::amdgcn_struct_buffer_load_format:
4317     return legalizeBufferLoad(MI, MRI, B, true, false);
4318   case Intrinsic::amdgcn_raw_tbuffer_load:
4319   case Intrinsic::amdgcn_struct_tbuffer_load:
4320     return legalizeBufferLoad(MI, MRI, B, true, true);
4321   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4322   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4323   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4324   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4325   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4326   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4327   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4328   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4329   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4330   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4331   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4332   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4333   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4334   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4335   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4336   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4337   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4338   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4339   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4340   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4341   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4342   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4343   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4344   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4345   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4346   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4347     return legalizeBufferAtomic(MI, B, IntrID);
4348   case Intrinsic::amdgcn_atomic_inc:
4349     return legalizeAtomicIncDec(MI, B, true);
4350   case Intrinsic::amdgcn_atomic_dec:
4351     return legalizeAtomicIncDec(MI, B, false);
4352   case Intrinsic::trap:
4353     return legalizeTrapIntrinsic(MI, MRI, B);
4354   case Intrinsic::debugtrap:
4355     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4356   default: {
4357     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4358             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4359       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4360     return true;
4361   }
4362   }
4363 
4364   return true;
4365 }
4366