1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
100 // Increase the number of vector elements to reach the next multiple of 32-bit
101 // type.
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
182 // Any combination of 32 or 64-bit elements up the maximum register size, and
183 // multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
299 // workaround this. Eventually it should ignore the type for loads and only care
300 // about the size. Return true in cases where we will workaround this for now by
301 // bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
444   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
445     .customFor({S32, S64})
446     .clampScalar(0, S32, S64)
447     .widenScalarToNextPow2(0, 32)
448     .scalarize(0);
449 
450   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
451     .legalFor({S32})
452     .clampScalar(0, S32, S32)
453     .scalarize(0);
454 
455   // Report legal for any types we can handle anywhere. For the cases only legal
456   // on the SALU, RegBankSelect will be able to re-legalize.
457   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
458     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
459     .clampScalar(0, S32, S64)
460     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
461     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
462     .widenScalarToNextPow2(0)
463     .scalarize(0);
464 
465   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
466                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
467     .legalFor({{S32, S1}, {S32, S32}})
468     .minScalar(0, S32)
469     // TODO: .scalarize(0)
470     .lower();
471 
472   getActionDefinitionsBuilder(G_BITCAST)
473     // Don't worry about the size constraint.
474     .legalIf(all(isRegisterType(0), isRegisterType(1)))
475     .lower();
476 
477 
478   getActionDefinitionsBuilder(G_CONSTANT)
479     .legalFor({S1, S32, S64, S16, GlobalPtr,
480                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
481     .clampScalar(0, S32, S64)
482     .widenScalarToNextPow2(0)
483     .legalIf(isPointer(0));
484 
485   getActionDefinitionsBuilder(G_FCONSTANT)
486     .legalFor({S32, S64, S16})
487     .clampScalar(0, S16, S64);
488 
489   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
490       .legalIf(isRegisterType(0))
491       // s1 and s16 are special cases because they have legal operations on
492       // them, but don't really occupy registers in the normal way.
493       .legalFor({S1, S16})
494       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
495       .clampScalarOrElt(0, S32, MaxScalar)
496       .widenScalarToNextPow2(0, 32)
497       .clampMaxNumElements(0, S32, 16);
498 
499   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
500 
501   // If the amount is divergent, we have to do a wave reduction to get the
502   // maximum value, so this is expanded during RegBankSelect.
503   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
504     .legalFor({{PrivatePtr, S32}});
505 
506   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
507     .unsupportedFor({PrivatePtr})
508     .custom();
509   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
510 
511   auto &FPOpActions = getActionDefinitionsBuilder(
512     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
513     .legalFor({S32, S64});
514   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
515     .customFor({S32, S64});
516   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
517     .customFor({S32, S64});
518 
519   if (ST.has16BitInsts()) {
520     if (ST.hasVOP3PInsts())
521       FPOpActions.legalFor({S16, V2S16});
522     else
523       FPOpActions.legalFor({S16});
524 
525     TrigActions.customFor({S16});
526     FDIVActions.customFor({S16});
527   }
528 
529   auto &MinNumMaxNum = getActionDefinitionsBuilder({
530       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
531 
532   if (ST.hasVOP3PInsts()) {
533     MinNumMaxNum.customFor(FPTypesPK16)
534       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
535       .clampMaxNumElements(0, S16, 2)
536       .clampScalar(0, S16, S64)
537       .scalarize(0);
538   } else if (ST.has16BitInsts()) {
539     MinNumMaxNum.customFor(FPTypes16)
540       .clampScalar(0, S16, S64)
541       .scalarize(0);
542   } else {
543     MinNumMaxNum.customFor(FPTypesBase)
544       .clampScalar(0, S32, S64)
545       .scalarize(0);
546   }
547 
548   if (ST.hasVOP3PInsts())
549     FPOpActions.clampMaxNumElements(0, S16, 2);
550 
551   FPOpActions
552     .scalarize(0)
553     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
554 
555   TrigActions
556     .scalarize(0)
557     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
558 
559   FDIVActions
560     .scalarize(0)
561     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
562 
563   getActionDefinitionsBuilder({G_FNEG, G_FABS})
564     .legalFor(FPTypesPK16)
565     .clampMaxNumElements(0, S16, 2)
566     .scalarize(0)
567     .clampScalar(0, S16, S64);
568 
569   if (ST.has16BitInsts()) {
570     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
571       .legalFor({S32, S64, S16})
572       .scalarize(0)
573       .clampScalar(0, S16, S64);
574   } else {
575     getActionDefinitionsBuilder(G_FSQRT)
576       .legalFor({S32, S64})
577       .scalarize(0)
578       .clampScalar(0, S32, S64);
579 
580     if (ST.hasFractBug()) {
581       getActionDefinitionsBuilder(G_FFLOOR)
582         .customFor({S64})
583         .legalFor({S32, S64})
584         .scalarize(0)
585         .clampScalar(0, S32, S64);
586     } else {
587       getActionDefinitionsBuilder(G_FFLOOR)
588         .legalFor({S32, S64})
589         .scalarize(0)
590         .clampScalar(0, S32, S64);
591     }
592   }
593 
594   getActionDefinitionsBuilder(G_FPTRUNC)
595     .legalFor({{S32, S64}, {S16, S32}})
596     .scalarize(0)
597     .lower();
598 
599   getActionDefinitionsBuilder(G_FPEXT)
600     .legalFor({{S64, S32}, {S32, S16}})
601     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
602     .scalarize(0);
603 
604   getActionDefinitionsBuilder(G_FSUB)
605       // Use actual fsub instruction
606       .legalFor({S32})
607       // Must use fadd + fneg
608       .lowerFor({S64, S16, V2S16})
609       .scalarize(0)
610       .clampScalar(0, S32, S64);
611 
612   // Whether this is legal depends on the floating point mode for the function.
613   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
614   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
615     FMad.customFor({S32, S16});
616   else if (ST.hasMadMacF32Insts())
617     FMad.customFor({S32});
618   else if (ST.hasMadF16())
619     FMad.customFor({S16});
620   FMad.scalarize(0)
621       .lower();
622 
623   // TODO: Do we need to clamp maximum bitwidth?
624   getActionDefinitionsBuilder(G_TRUNC)
625     .legalIf(isScalar(0))
626     .legalFor({{V2S16, V2S32}})
627     .clampMaxNumElements(0, S16, 2)
628     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
629     // situations (like an invalid implicit use), we don't want to infinite loop
630     // in the legalizer.
631     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
632     .alwaysLegal();
633 
634   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
635     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
636                {S32, S1}, {S64, S1}, {S16, S1}})
637     .scalarize(0)
638     .clampScalar(0, S32, S64)
639     .widenScalarToNextPow2(1, 32);
640 
641   // TODO: Split s1->s64 during regbankselect for VALU.
642   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
643     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
644     .lowerFor({{S32, S64}})
645     .lowerIf(typeIs(1, S1))
646     .customFor({{S64, S64}});
647   if (ST.has16BitInsts())
648     IToFP.legalFor({{S16, S16}});
649   IToFP.clampScalar(1, S32, S64)
650        .minScalar(0, S32)
651        .scalarize(0)
652        .widenScalarToNextPow2(1);
653 
654   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656     .customFor({{S64, S64}})
657     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
658   if (ST.has16BitInsts())
659     FPToI.legalFor({{S16, S16}});
660   else
661     FPToI.minScalar(1, S32);
662 
663   FPToI.minScalar(0, S32)
664        .scalarize(0)
665        .lower();
666 
667   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
668     .scalarize(0)
669     .lower();
670 
671   if (ST.has16BitInsts()) {
672     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
673       .legalFor({S16, S32, S64})
674       .clampScalar(0, S16, S64)
675       .scalarize(0);
676   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
677     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
678       .legalFor({S32, S64})
679       .clampScalar(0, S32, S64)
680       .scalarize(0);
681   } else {
682     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
683       .legalFor({S32})
684       .customFor({S64})
685       .clampScalar(0, S32, S64)
686       .scalarize(0);
687   }
688 
689   // FIXME: Clamp offset operand.
690   getActionDefinitionsBuilder(G_PTR_ADD)
691     .legalIf(isPointer(0))
692     .scalarize(0);
693 
694   getActionDefinitionsBuilder(G_PTRMASK)
695     .legalIf(typeInSet(1, {S64, S32}))
696     .minScalar(1, S32)
697     .maxScalarIf(sizeIs(0, 32), 1, S32)
698     .maxScalarIf(sizeIs(0, 64), 1, S64)
699     .scalarize(0);
700 
701   auto &CmpBuilder =
702     getActionDefinitionsBuilder(G_ICMP)
703     // The compare output type differs based on the register bank of the output,
704     // so make both s1 and s32 legal.
705     //
706     // Scalar compares producing output in scc will be promoted to s32, as that
707     // is the allocatable register type that will be needed for the copy from
708     // scc. This will be promoted during RegBankSelect, and we assume something
709     // before that won't try to use s32 result types.
710     //
711     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
712     // bank.
713     .legalForCartesianProduct(
714       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
715     .legalForCartesianProduct(
716       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
717   if (ST.has16BitInsts()) {
718     CmpBuilder.legalFor({{S1, S16}});
719   }
720 
721   CmpBuilder
722     .widenScalarToNextPow2(1)
723     .clampScalar(1, S32, S64)
724     .scalarize(0)
725     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
726 
727   getActionDefinitionsBuilder(G_FCMP)
728     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
729     .widenScalarToNextPow2(1)
730     .clampScalar(1, S32, S64)
731     .scalarize(0);
732 
733   // FIXME: fpow has a selection pattern that should move to custom lowering.
734   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
735   if (ST.has16BitInsts())
736     Exp2Ops.legalFor({S32, S16});
737   else
738     Exp2Ops.legalFor({S32});
739   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
740   Exp2Ops.scalarize(0);
741 
742   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
743   if (ST.has16BitInsts())
744     ExpOps.customFor({{S32}, {S16}});
745   else
746     ExpOps.customFor({S32});
747   ExpOps.clampScalar(0, MinScalarFPTy, S32)
748         .scalarize(0);
749 
750   // The 64-bit versions produce 32-bit results, but only on the SALU.
751   getActionDefinitionsBuilder(G_CTPOP)
752     .legalFor({{S32, S32}, {S32, S64}})
753     .clampScalar(0, S32, S32)
754     .clampScalar(1, S32, S64)
755     .scalarize(0)
756     .widenScalarToNextPow2(0, 32)
757     .widenScalarToNextPow2(1, 32);
758 
759   // The hardware instructions return a different result on 0 than the generic
760   // instructions expect. The hardware produces -1, but these produce the
761   // bitwidth.
762   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
763     .scalarize(0)
764     .clampScalar(0, S32, S32)
765     .clampScalar(1, S32, S64)
766     .widenScalarToNextPow2(0, 32)
767     .widenScalarToNextPow2(1, 32)
768     .lower();
769 
770   // The 64-bit versions produce 32-bit results, but only on the SALU.
771   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
772     .legalFor({{S32, S32}, {S32, S64}})
773     .clampScalar(0, S32, S32)
774     .clampScalar(1, S32, S64)
775     .scalarize(0)
776     .widenScalarToNextPow2(0, 32)
777     .widenScalarToNextPow2(1, 32);
778 
779   getActionDefinitionsBuilder(G_BITREVERSE)
780     .legalFor({S32})
781     .clampScalar(0, S32, S32)
782     .scalarize(0);
783 
784   if (ST.has16BitInsts()) {
785     getActionDefinitionsBuilder(G_BSWAP)
786       .legalFor({S16, S32, V2S16})
787       .clampMaxNumElements(0, S16, 2)
788       // FIXME: Fixing non-power-of-2 before clamp is workaround for
789       // narrowScalar limitation.
790       .widenScalarToNextPow2(0)
791       .clampScalar(0, S16, S32)
792       .scalarize(0);
793 
794     if (ST.hasVOP3PInsts()) {
795       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
796         .legalFor({S32, S16, V2S16})
797         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
798         .clampMaxNumElements(0, S16, 2)
799         .minScalar(0, S16)
800         .widenScalarToNextPow2(0)
801         .scalarize(0)
802         .lower();
803     } else {
804       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
805         .legalFor({S32, S16})
806         .widenScalarToNextPow2(0)
807         .minScalar(0, S16)
808         .scalarize(0)
809         .lower();
810     }
811   } else {
812     // TODO: Should have same legality without v_perm_b32
813     getActionDefinitionsBuilder(G_BSWAP)
814       .legalFor({S32})
815       .lowerIf(scalarNarrowerThan(0, 32))
816       // FIXME: Fixing non-power-of-2 before clamp is workaround for
817       // narrowScalar limitation.
818       .widenScalarToNextPow2(0)
819       .maxScalar(0, S32)
820       .scalarize(0)
821       .lower();
822 
823     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
824       .legalFor({S32})
825       .minScalar(0, S32)
826       .widenScalarToNextPow2(0)
827       .scalarize(0)
828       .lower();
829   }
830 
831   getActionDefinitionsBuilder(G_INTTOPTR)
832     // List the common cases
833     .legalForCartesianProduct(AddrSpaces64, {S64})
834     .legalForCartesianProduct(AddrSpaces32, {S32})
835     .scalarize(0)
836     // Accept any address space as long as the size matches
837     .legalIf(sameSize(0, 1))
838     .widenScalarIf(smallerThan(1, 0),
839       [](const LegalityQuery &Query) {
840         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
841       })
842     .narrowScalarIf(largerThan(1, 0),
843       [](const LegalityQuery &Query) {
844         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
845       });
846 
847   getActionDefinitionsBuilder(G_PTRTOINT)
848     // List the common cases
849     .legalForCartesianProduct(AddrSpaces64, {S64})
850     .legalForCartesianProduct(AddrSpaces32, {S32})
851     .scalarize(0)
852     // Accept any address space as long as the size matches
853     .legalIf(sameSize(0, 1))
854     .widenScalarIf(smallerThan(0, 1),
855       [](const LegalityQuery &Query) {
856         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
857       })
858     .narrowScalarIf(
859       largerThan(0, 1),
860       [](const LegalityQuery &Query) {
861         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
862       });
863 
864   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
865     .scalarize(0)
866     .custom();
867 
868   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
869                                     bool IsLoad) -> bool {
870     const LLT DstTy = Query.Types[0];
871 
872     // Split vector extloads.
873     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
874     unsigned Align = Query.MMODescrs[0].AlignInBits;
875 
876     if (MemSize < DstTy.getSizeInBits())
877       MemSize = std::max(MemSize, Align);
878 
879     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
880       return true;
881 
882     const LLT PtrTy = Query.Types[1];
883     unsigned AS = PtrTy.getAddressSpace();
884     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
885       return true;
886 
887     // Catch weird sized loads that don't evenly divide into the access sizes
888     // TODO: May be able to widen depending on alignment etc.
889     unsigned NumRegs = (MemSize + 31) / 32;
890     if (NumRegs == 3) {
891       if (!ST.hasDwordx3LoadStores())
892         return true;
893     } else {
894       // If the alignment allows, these should have been widened.
895       if (!isPowerOf2_32(NumRegs))
896         return true;
897     }
898 
899     if (Align < MemSize) {
900       const SITargetLowering *TLI = ST.getTargetLowering();
901       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
902     }
903 
904     return false;
905   };
906 
907   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
908                                          unsigned Opc) -> bool {
909     unsigned Size = Query.Types[0].getSizeInBits();
910     if (isPowerOf2_32(Size))
911       return false;
912 
913     if (Size == 96 && ST.hasDwordx3LoadStores())
914       return false;
915 
916     unsigned AddrSpace = Query.Types[1].getAddressSpace();
917     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
918       return false;
919 
920     unsigned Align = Query.MMODescrs[0].AlignInBits;
921     unsigned RoundedSize = NextPowerOf2(Size);
922     return (Align >= RoundedSize);
923   };
924 
925   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
926   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
927   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
928 
929   // TODO: Refine based on subtargets which support unaligned access or 128-bit
930   // LDS
931   // TODO: Unsupported flat for SI.
932 
933   for (unsigned Op : {G_LOAD, G_STORE}) {
934     const bool IsStore = Op == G_STORE;
935 
936     auto &Actions = getActionDefinitionsBuilder(Op);
937     // Explicitly list some common cases.
938     // TODO: Does this help compile time at all?
939     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
940                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
941                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
942                                       {S64, GlobalPtr, 64, GlobalAlign32},
943                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
944                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
945                                       {S32, GlobalPtr, 8, GlobalAlign8},
946                                       {S32, GlobalPtr, 16, GlobalAlign16},
947 
948                                       {S32, LocalPtr, 32, 32},
949                                       {S64, LocalPtr, 64, 32},
950                                       {V2S32, LocalPtr, 64, 32},
951                                       {S32, LocalPtr, 8, 8},
952                                       {S32, LocalPtr, 16, 16},
953                                       {V2S16, LocalPtr, 32, 32},
954 
955                                       {S32, PrivatePtr, 32, 32},
956                                       {S32, PrivatePtr, 8, 8},
957                                       {S32, PrivatePtr, 16, 16},
958                                       {V2S16, PrivatePtr, 32, 32},
959 
960                                       {S32, ConstantPtr, 32, GlobalAlign32},
961                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
962                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
963                                       {S64, ConstantPtr, 64, GlobalAlign32},
964                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
965     Actions.legalIf(
966       [=](const LegalityQuery &Query) -> bool {
967         return isLoadStoreLegal(ST, Query, Op);
968       });
969 
970     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
971     // 64-bits.
972     //
973     // TODO: Should generalize bitcast action into coerce, which will also cover
974     // inserting addrspacecasts.
975     Actions.customIf(typeIs(1, Constant32Ptr));
976 
977     // Turn any illegal element vectors into something easier to deal
978     // with. These will ultimately produce 32-bit scalar shifts to extract the
979     // parts anyway.
980     //
981     // For odd 16-bit element vectors, prefer to split those into pieces with
982     // 16-bit vector parts.
983     Actions.bitcastIf(
984       [=](const LegalityQuery &Query) -> bool {
985         const LLT Ty = Query.Types[0];
986         const unsigned Size = Ty.getSizeInBits();
987 
988         if (Size != Query.MMODescrs[0].SizeInBits)
989           return Size <= 32 && Ty.isVector();
990 
991         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
992           return true;
993         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
994                !isRegisterVectorElementType(Ty.getElementType());
995       }, bitcastToRegisterType(0));
996 
997     Actions
998         .customIf(typeIs(1, Constant32Ptr))
999         // Widen suitably aligned loads by loading extra elements.
1000         .moreElementsIf([=](const LegalityQuery &Query) {
1001             const LLT Ty = Query.Types[0];
1002             return Op == G_LOAD && Ty.isVector() &&
1003                    shouldWidenLoadResult(Query, Op);
1004           }, moreElementsToNextPow2(0))
1005         .widenScalarIf([=](const LegalityQuery &Query) {
1006             const LLT Ty = Query.Types[0];
1007             return Op == G_LOAD && !Ty.isVector() &&
1008                    shouldWidenLoadResult(Query, Op);
1009           }, widenScalarOrEltToNextPow2(0))
1010         .narrowScalarIf(
1011             [=](const LegalityQuery &Query) -> bool {
1012               return !Query.Types[0].isVector() &&
1013                      needToSplitMemOp(Query, Op == G_LOAD);
1014             },
1015             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1016               const LLT DstTy = Query.Types[0];
1017               const LLT PtrTy = Query.Types[1];
1018 
1019               const unsigned DstSize = DstTy.getSizeInBits();
1020               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1021 
1022               // Split extloads.
1023               if (DstSize > MemSize)
1024                 return std::make_pair(0, LLT::scalar(MemSize));
1025 
1026               if (!isPowerOf2_32(DstSize)) {
1027                 // We're probably decomposing an odd sized store. Try to split
1028                 // to the widest type. TODO: Account for alignment. As-is it
1029                 // should be OK, since the new parts will be further legalized.
1030                 unsigned FloorSize = PowerOf2Floor(DstSize);
1031                 return std::make_pair(0, LLT::scalar(FloorSize));
1032               }
1033 
1034               if (DstSize > 32 && (DstSize % 32 != 0)) {
1035                 // FIXME: Need a way to specify non-extload of larger size if
1036                 // suitably aligned.
1037                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1038               }
1039 
1040               unsigned MaxSize = maxSizeForAddrSpace(ST,
1041                                                      PtrTy.getAddressSpace(),
1042                                                      Op == G_LOAD);
1043               if (MemSize > MaxSize)
1044                 return std::make_pair(0, LLT::scalar(MaxSize));
1045 
1046               unsigned Align = Query.MMODescrs[0].AlignInBits;
1047               return std::make_pair(0, LLT::scalar(Align));
1048             })
1049         .fewerElementsIf(
1050             [=](const LegalityQuery &Query) -> bool {
1051               return Query.Types[0].isVector() &&
1052                      needToSplitMemOp(Query, Op == G_LOAD);
1053             },
1054             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1055               const LLT DstTy = Query.Types[0];
1056               const LLT PtrTy = Query.Types[1];
1057 
1058               LLT EltTy = DstTy.getElementType();
1059               unsigned MaxSize = maxSizeForAddrSpace(ST,
1060                                                      PtrTy.getAddressSpace(),
1061                                                      Op == G_LOAD);
1062 
1063               // FIXME: Handle widened to power of 2 results better. This ends
1064               // up scalarizing.
1065               // FIXME: 3 element stores scalarized on SI
1066 
1067               // Split if it's too large for the address space.
1068               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1069                 unsigned NumElts = DstTy.getNumElements();
1070                 unsigned EltSize = EltTy.getSizeInBits();
1071 
1072                 if (MaxSize % EltSize == 0) {
1073                   return std::make_pair(
1074                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1075                 }
1076 
1077                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1078 
1079                 // FIXME: Refine when odd breakdowns handled
1080                 // The scalars will need to be re-legalized.
1081                 if (NumPieces == 1 || NumPieces >= NumElts ||
1082                     NumElts % NumPieces != 0)
1083                   return std::make_pair(0, EltTy);
1084 
1085                 return std::make_pair(0,
1086                                       LLT::vector(NumElts / NumPieces, EltTy));
1087               }
1088 
1089               // FIXME: We could probably handle weird extending loads better.
1090               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1091               if (DstTy.getSizeInBits() > MemSize)
1092                 return std::make_pair(0, EltTy);
1093 
1094               unsigned EltSize = EltTy.getSizeInBits();
1095               unsigned DstSize = DstTy.getSizeInBits();
1096               if (!isPowerOf2_32(DstSize)) {
1097                 // We're probably decomposing an odd sized store. Try to split
1098                 // to the widest type. TODO: Account for alignment. As-is it
1099                 // should be OK, since the new parts will be further legalized.
1100                 unsigned FloorSize = PowerOf2Floor(DstSize);
1101                 return std::make_pair(
1102                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1103               }
1104 
1105               // Need to split because of alignment.
1106               unsigned Align = Query.MMODescrs[0].AlignInBits;
1107               if (EltSize > Align &&
1108                   (EltSize / Align < DstTy.getNumElements())) {
1109                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1110               }
1111 
1112               // May need relegalization for the scalars.
1113               return std::make_pair(0, EltTy);
1114             })
1115         .minScalar(0, S32);
1116 
1117     if (IsStore)
1118       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1119 
1120     // TODO: Need a bitcast lower option?
1121     Actions
1122         .widenScalarToNextPow2(0)
1123         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1124   }
1125 
1126   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1127                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1128                                                   {S32, GlobalPtr, 16, 2 * 8},
1129                                                   {S32, LocalPtr, 8, 8},
1130                                                   {S32, LocalPtr, 16, 16},
1131                                                   {S32, PrivatePtr, 8, 8},
1132                                                   {S32, PrivatePtr, 16, 16},
1133                                                   {S32, ConstantPtr, 8, 8},
1134                                                   {S32, ConstantPtr, 16, 2 * 8}});
1135   if (ST.hasFlatAddressSpace()) {
1136     ExtLoads.legalForTypesWithMemDesc(
1137         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1138   }
1139 
1140   ExtLoads.clampScalar(0, S32, S32)
1141           .widenScalarToNextPow2(0)
1142           .unsupportedIfMemSizeNotPow2()
1143           .lower();
1144 
1145   auto &Atomics = getActionDefinitionsBuilder(
1146     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1147      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1148      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1149      G_ATOMICRMW_UMIN})
1150     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1151                {S64, GlobalPtr}, {S64, LocalPtr}});
1152   if (ST.hasFlatAddressSpace()) {
1153     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1154   }
1155 
1156   if (ST.hasLDSFPAtomics()) {
1157     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1158       .legalFor({{S32, LocalPtr}});
1159   }
1160 
1161   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1162   // demarshalling
1163   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1164     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1165                 {S32, FlatPtr}, {S64, FlatPtr}})
1166     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1167                {S32, RegionPtr}, {S64, RegionPtr}});
1168   // TODO: Pointer types, any 32-bit or 64-bit vector
1169 
1170   // Condition should be s32 for scalar, s1 for vector.
1171   getActionDefinitionsBuilder(G_SELECT)
1172     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1173           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1174           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1175     .clampScalar(0, S16, S64)
1176     .scalarize(1)
1177     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1178     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1179     .clampMaxNumElements(0, S32, 2)
1180     .clampMaxNumElements(0, LocalPtr, 2)
1181     .clampMaxNumElements(0, PrivatePtr, 2)
1182     .scalarize(0)
1183     .widenScalarToNextPow2(0)
1184     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1185 
1186   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1187   // be more flexible with the shift amount type.
1188   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1189     .legalFor({{S32, S32}, {S64, S32}});
1190   if (ST.has16BitInsts()) {
1191     if (ST.hasVOP3PInsts()) {
1192       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1193             .clampMaxNumElements(0, S16, 2);
1194     } else
1195       Shifts.legalFor({{S16, S16}});
1196 
1197     // TODO: Support 16-bit shift amounts for all types
1198     Shifts.widenScalarIf(
1199       [=](const LegalityQuery &Query) {
1200         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1201         // 32-bit amount.
1202         const LLT ValTy = Query.Types[0];
1203         const LLT AmountTy = Query.Types[1];
1204         return ValTy.getSizeInBits() <= 16 &&
1205                AmountTy.getSizeInBits() < 16;
1206       }, changeTo(1, S16));
1207     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1208     Shifts.clampScalar(1, S32, S32);
1209     Shifts.clampScalar(0, S16, S64);
1210     Shifts.widenScalarToNextPow2(0, 16);
1211   } else {
1212     // Make sure we legalize the shift amount type first, as the general
1213     // expansion for the shifted type will produce much worse code if it hasn't
1214     // been truncated already.
1215     Shifts.clampScalar(1, S32, S32);
1216     Shifts.clampScalar(0, S32, S64);
1217     Shifts.widenScalarToNextPow2(0, 32);
1218   }
1219   Shifts.scalarize(0);
1220 
1221   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1222     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1223     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1224     unsigned IdxTypeIdx = 2;
1225 
1226     getActionDefinitionsBuilder(Op)
1227       .customIf([=](const LegalityQuery &Query) {
1228           const LLT EltTy = Query.Types[EltTypeIdx];
1229           const LLT VecTy = Query.Types[VecTypeIdx];
1230           const LLT IdxTy = Query.Types[IdxTypeIdx];
1231           return (EltTy.getSizeInBits() == 16 ||
1232                   EltTy.getSizeInBits() % 32 == 0) &&
1233                  VecTy.getSizeInBits() % 32 == 0 &&
1234                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1235                  IdxTy.getSizeInBits() == 32;
1236         })
1237       .clampScalar(EltTypeIdx, S32, S64)
1238       .clampScalar(VecTypeIdx, S32, S64)
1239       .clampScalar(IdxTypeIdx, S32, S32);
1240   }
1241 
1242   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1243     .unsupportedIf([=](const LegalityQuery &Query) {
1244         const LLT &EltTy = Query.Types[1].getElementType();
1245         return Query.Types[0] != EltTy;
1246       });
1247 
1248   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1249     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1250     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1251 
1252     // FIXME: Doesn't handle extract of illegal sizes.
1253     getActionDefinitionsBuilder(Op)
1254       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1255       // FIXME: Multiples of 16 should not be legal.
1256       .legalIf([=](const LegalityQuery &Query) {
1257           const LLT BigTy = Query.Types[BigTyIdx];
1258           const LLT LitTy = Query.Types[LitTyIdx];
1259           return (BigTy.getSizeInBits() % 32 == 0) &&
1260                  (LitTy.getSizeInBits() % 16 == 0);
1261         })
1262       .widenScalarIf(
1263         [=](const LegalityQuery &Query) {
1264           const LLT BigTy = Query.Types[BigTyIdx];
1265           return (BigTy.getScalarSizeInBits() < 16);
1266         },
1267         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1268       .widenScalarIf(
1269         [=](const LegalityQuery &Query) {
1270           const LLT LitTy = Query.Types[LitTyIdx];
1271           return (LitTy.getScalarSizeInBits() < 16);
1272         },
1273         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1274       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1275       .widenScalarToNextPow2(BigTyIdx, 32);
1276 
1277   }
1278 
1279   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1280     .legalForCartesianProduct(AllS32Vectors, {S32})
1281     .legalForCartesianProduct(AllS64Vectors, {S64})
1282     .clampNumElements(0, V16S32, V32S32)
1283     .clampNumElements(0, V2S64, V16S64)
1284     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1285 
1286   if (ST.hasScalarPackInsts()) {
1287     BuildVector
1288       // FIXME: Should probably widen s1 vectors straight to s32
1289       .minScalarOrElt(0, S16)
1290       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1291       .minScalar(1, S32);
1292 
1293     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1294       .legalFor({V2S16, S32})
1295       .lower();
1296     BuildVector.minScalarOrElt(0, S32);
1297   } else {
1298     BuildVector.customFor({V2S16, S16});
1299     BuildVector.minScalarOrElt(0, S32);
1300 
1301     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1302       .customFor({V2S16, S32})
1303       .lower();
1304   }
1305 
1306   BuildVector.legalIf(isRegisterType(0));
1307 
1308   // FIXME: Clamp maximum size
1309   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1310     .legalIf(isRegisterType(0));
1311 
1312   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1313   // pre-legalize.
1314   if (ST.hasVOP3PInsts()) {
1315     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1316       .customFor({V2S16, V2S16})
1317       .lower();
1318   } else
1319     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1320 
1321   // Merge/Unmerge
1322   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1323     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1324     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1325 
1326     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1327       const LLT Ty = Query.Types[TypeIdx];
1328       if (Ty.isVector()) {
1329         const LLT &EltTy = Ty.getElementType();
1330         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1331           return true;
1332         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1333           return true;
1334       }
1335       return false;
1336     };
1337 
1338     auto &Builder = getActionDefinitionsBuilder(Op)
1339       .lowerFor({{S16, V2S16}})
1340       .lowerIf([=](const LegalityQuery &Query) {
1341           const LLT BigTy = Query.Types[BigTyIdx];
1342           return BigTy.getSizeInBits() == 32;
1343         })
1344       // Try to widen to s16 first for small types.
1345       // TODO: Only do this on targets with legal s16 shifts
1346       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1347       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1348       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1349       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1350                            elementTypeIs(1, S16)),
1351                        changeTo(1, V2S16))
1352       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1353       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1354       // valid.
1355       .clampScalar(LitTyIdx, S32, S512)
1356       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1357       // Break up vectors with weird elements into scalars
1358       .fewerElementsIf(
1359         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1360         scalarize(0))
1361       .fewerElementsIf(
1362         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1363         scalarize(1))
1364       .clampScalar(BigTyIdx, S32, MaxScalar);
1365 
1366     if (Op == G_MERGE_VALUES) {
1367       Builder.widenScalarIf(
1368         // TODO: Use 16-bit shifts if legal for 8-bit values?
1369         [=](const LegalityQuery &Query) {
1370           const LLT Ty = Query.Types[LitTyIdx];
1371           return Ty.getSizeInBits() < 32;
1372         },
1373         changeTo(LitTyIdx, S32));
1374     }
1375 
1376     Builder.widenScalarIf(
1377       [=](const LegalityQuery &Query) {
1378         const LLT Ty = Query.Types[BigTyIdx];
1379         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1380           Ty.getSizeInBits() % 16 != 0;
1381       },
1382       [=](const LegalityQuery &Query) {
1383         // Pick the next power of 2, or a multiple of 64 over 128.
1384         // Whichever is smaller.
1385         const LLT &Ty = Query.Types[BigTyIdx];
1386         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1387         if (NewSizeInBits >= 256) {
1388           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1389           if (RoundedTo < NewSizeInBits)
1390             NewSizeInBits = RoundedTo;
1391         }
1392         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1393       })
1394       .legalIf([=](const LegalityQuery &Query) {
1395           const LLT &BigTy = Query.Types[BigTyIdx];
1396           const LLT &LitTy = Query.Types[LitTyIdx];
1397 
1398           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1399             return false;
1400           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1401             return false;
1402 
1403           return BigTy.getSizeInBits() % 16 == 0 &&
1404                  LitTy.getSizeInBits() % 16 == 0 &&
1405                  BigTy.getSizeInBits() <= MaxRegisterSize;
1406         })
1407       // Any vectors left are the wrong size. Scalarize them.
1408       .scalarize(0)
1409       .scalarize(1);
1410   }
1411 
1412   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1413   // RegBankSelect.
1414   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1415     .legalFor({{S32}, {S64}});
1416 
1417   if (ST.hasVOP3PInsts()) {
1418     SextInReg.lowerFor({{V2S16}})
1419       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1420       // get more vector shift opportunities, since we'll get those when
1421       // expanded.
1422       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1423   } else if (ST.has16BitInsts()) {
1424     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1425   } else {
1426     // Prefer to promote to s32 before lowering if we don't have 16-bit
1427     // shifts. This avoids a lot of intermediate truncate/extend operations.
1428     SextInReg.lowerFor({{S32}, {S64}});
1429   }
1430 
1431   // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
1432   // available, and is selectively legal for s16, s32, v2s16.
1433   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
1434     .scalarize(0)
1435     .clampScalar(0, S16, S32);
1436 
1437   SextInReg
1438     .scalarize(0)
1439     .clampScalar(0, S32, S64)
1440     .lower();
1441 
1442   getActionDefinitionsBuilder(G_FSHR)
1443     .legalFor({{S32, S32}})
1444     .scalarize(0)
1445     .lower();
1446 
1447   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1448     .legalFor({S64});
1449 
1450   getActionDefinitionsBuilder({
1451       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1452       G_FCOPYSIGN,
1453 
1454       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1455       G_READ_REGISTER,
1456       G_WRITE_REGISTER,
1457 
1458       G_SADDO, G_SSUBO,
1459 
1460       // TODO: Implement
1461       G_FMINIMUM, G_FMAXIMUM,
1462       G_FSHL
1463     }).lower();
1464 
1465   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1466         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1467         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1468     .unsupported();
1469 
1470   computeTables();
1471   verify(*ST.getInstrInfo());
1472 }
1473 
1474 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1475                                          MachineInstr &MI) const {
1476   MachineIRBuilder &B = Helper.MIRBuilder;
1477   MachineRegisterInfo &MRI = *B.getMRI();
1478   GISelChangeObserver &Observer = Helper.Observer;
1479 
1480   switch (MI.getOpcode()) {
1481   case TargetOpcode::G_ADDRSPACE_CAST:
1482     return legalizeAddrSpaceCast(MI, MRI, B);
1483   case TargetOpcode::G_FRINT:
1484     return legalizeFrint(MI, MRI, B);
1485   case TargetOpcode::G_FCEIL:
1486     return legalizeFceil(MI, MRI, B);
1487   case TargetOpcode::G_INTRINSIC_TRUNC:
1488     return legalizeIntrinsicTrunc(MI, MRI, B);
1489   case TargetOpcode::G_SITOFP:
1490     return legalizeITOFP(MI, MRI, B, true);
1491   case TargetOpcode::G_UITOFP:
1492     return legalizeITOFP(MI, MRI, B, false);
1493   case TargetOpcode::G_FPTOSI:
1494     return legalizeFPTOI(MI, MRI, B, true);
1495   case TargetOpcode::G_FPTOUI:
1496     return legalizeFPTOI(MI, MRI, B, false);
1497   case TargetOpcode::G_FMINNUM:
1498   case TargetOpcode::G_FMAXNUM:
1499   case TargetOpcode::G_FMINNUM_IEEE:
1500   case TargetOpcode::G_FMAXNUM_IEEE:
1501     return legalizeMinNumMaxNum(Helper, MI);
1502   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1503     return legalizeExtractVectorElt(MI, MRI, B);
1504   case TargetOpcode::G_INSERT_VECTOR_ELT:
1505     return legalizeInsertVectorElt(MI, MRI, B);
1506   case TargetOpcode::G_SHUFFLE_VECTOR:
1507     return legalizeShuffleVector(MI, MRI, B);
1508   case TargetOpcode::G_FSIN:
1509   case TargetOpcode::G_FCOS:
1510     return legalizeSinCos(MI, MRI, B);
1511   case TargetOpcode::G_GLOBAL_VALUE:
1512     return legalizeGlobalValue(MI, MRI, B);
1513   case TargetOpcode::G_LOAD:
1514     return legalizeLoad(MI, MRI, B, Observer);
1515   case TargetOpcode::G_FMAD:
1516     return legalizeFMad(MI, MRI, B);
1517   case TargetOpcode::G_FDIV:
1518     return legalizeFDIV(MI, MRI, B);
1519   case TargetOpcode::G_UDIV:
1520   case TargetOpcode::G_UREM:
1521     return legalizeUDIV_UREM(MI, MRI, B);
1522   case TargetOpcode::G_SDIV:
1523   case TargetOpcode::G_SREM:
1524     return legalizeSDIV_SREM(MI, MRI, B);
1525   case TargetOpcode::G_ATOMIC_CMPXCHG:
1526     return legalizeAtomicCmpXChg(MI, MRI, B);
1527   case TargetOpcode::G_FLOG:
1528     return legalizeFlog(MI, B, numbers::ln2f);
1529   case TargetOpcode::G_FLOG10:
1530     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1531   case TargetOpcode::G_FEXP:
1532     return legalizeFExp(MI, B);
1533   case TargetOpcode::G_FPOW:
1534     return legalizeFPow(MI, B);
1535   case TargetOpcode::G_FFLOOR:
1536     return legalizeFFloor(MI, MRI, B);
1537   case TargetOpcode::G_BUILD_VECTOR:
1538     return legalizeBuildVector(MI, MRI, B);
1539   default:
1540     return false;
1541   }
1542 
1543   llvm_unreachable("expected switch to return");
1544 }
1545 
1546 Register AMDGPULegalizerInfo::getSegmentAperture(
1547   unsigned AS,
1548   MachineRegisterInfo &MRI,
1549   MachineIRBuilder &B) const {
1550   MachineFunction &MF = B.getMF();
1551   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1552   const LLT S32 = LLT::scalar(32);
1553 
1554   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1555 
1556   if (ST.hasApertureRegs()) {
1557     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1558     // getreg.
1559     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1560         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1561         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1562     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1563         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1564         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1565     unsigned Encoding =
1566         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1567         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1568         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1569 
1570     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1571 
1572     B.buildInstr(AMDGPU::S_GETREG_B32)
1573       .addDef(GetReg)
1574       .addImm(Encoding);
1575     MRI.setType(GetReg, S32);
1576 
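    // S_GETREG returns the WidthM1 + 1 wide field right-justified, so shift it
    // back up by WidthM1 + 1 to reconstruct the 32-bit aperture base (its low
    // bits are zero).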
1577     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1578     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1579   }
1580 
1581   Register QueuePtr = MRI.createGenericVirtualRegister(
1582     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1583 
1584   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1585   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1586     return Register();
1587 
1588   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1589   // private_segment_aperture_base_hi.
1590   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1591 
1592   // TODO: can we be smarter about machine pointer info?
1593   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1594   MachineMemOperand *MMO = MF.getMachineMemOperand(
1595       PtrInfo,
1596       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1597           MachineMemOperand::MOInvariant,
1598       4, commonAlignment(Align(64), StructOffset));
1599 
1600   Register LoadAddr;
1601 
1602   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1603   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1604 }
1605 
1606 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1607   MachineInstr &MI, MachineRegisterInfo &MRI,
1608   MachineIRBuilder &B) const {
1609   MachineFunction &MF = B.getMF();
1610 
1611   const LLT S32 = LLT::scalar(32);
1612   Register Dst = MI.getOperand(0).getReg();
1613   Register Src = MI.getOperand(1).getReg();
1614 
1615   LLT DstTy = MRI.getType(Dst);
1616   LLT SrcTy = MRI.getType(Src);
1617   unsigned DestAS = DstTy.getAddressSpace();
1618   unsigned SrcAS = SrcTy.getAddressSpace();
1619 
1620   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1621   // vector element.
1622   assert(!DstTy.isVector());
1623 
1624   const AMDGPUTargetMachine &TM
1625     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1626 
1627   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1628   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1629     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1630     return true;
1631   }
1632 
1633   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1634     // Truncate.
1635     B.buildExtract(Dst, Src, 0);
1636     MI.eraseFromParent();
1637     return true;
1638   }
1639 
1640   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1641     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1642     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1643 
1644     // FIXME: This is a bit ugly: we merge two pointer values to build another
1645     // pointer. Merge operands are required to be the same type, but creating
1646     // an extra ptrtoint would be kind of pointless.
1647     auto HighAddr = B.buildConstant(
1648       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1649     B.buildMerge(Dst, {Src, HighAddr});
1650     MI.eraseFromParent();
1651     return true;
1652   }
1653 
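  // Flat -> segment: the segment pointer is the low 32 bits of the flat
  // pointer, with the flat null value mapped to the segment null value.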
1654   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1655     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1656            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1657     unsigned NullVal = TM.getNullPointerValue(DestAS);
1658 
1659     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1660     auto FlatNull = B.buildConstant(SrcTy, 0);
1661 
1662     // Extract low 32-bits of the pointer.
1663     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1664 
1665     auto CmpRes =
1666         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1667     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1668 
1669     MI.eraseFromParent();
1670     return true;
1671   }
1672 
1673   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1674     return false;
1675 
1676   if (!ST.hasFlatAddressSpace())
1677     return false;
1678 
1679   auto SegmentNull =
1680       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1681   auto FlatNull =
1682       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1683 
1684   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1685   if (!ApertureReg.isValid())
1686     return false;
1687 
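  // Segment -> flat: the 64-bit flat pointer has the 32-bit segment offset in
  // the low half and the segment aperture base in the high half, with the
  // segment null value mapped to the flat null value. Roughly:
  //   %nonnull = G_ICMP ne %src, <segment null>
  //   %flat    = G_MERGE_VALUES (G_PTRTOINT %src), %aperture
  //   %dst     = G_SELECT %nonnull, %flat, <flat null>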
1688   auto CmpRes =
1689       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1690 
1691   // Coerce the type of the low half of the result so we can use merge_values.
1692   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1693 
1694   // TODO: Should we allow mismatched types but matching sizes in merges to
1695   // avoid the ptrtoint?
1696   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1697   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1698 
1699   MI.eraseFromParent();
1700   return true;
1701 }
1702 
1703 bool AMDGPULegalizerInfo::legalizeFrint(
1704   MachineInstr &MI, MachineRegisterInfo &MRI,
1705   MachineIRBuilder &B) const {
1706   Register Src = MI.getOperand(1).getReg();
1707   LLT Ty = MRI.getType(Src);
1708   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1709 
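  // The 2^52 trick: adding and then subtracting a sign-matched 2^52 forces
  // the fraction bits out of the f64 mantissa, rounding to an integer in the
  // current rounding mode. Values with |src| > 0x1.fffffffffffffp+51 are
  // already integral, so the final select passes them through unchanged.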
1710   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1711   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1712 
1713   auto C1 = B.buildFConstant(Ty, C1Val);
1714   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1715 
1716   // TODO: Should this propagate fast-math-flags?
1717   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1718   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1719 
1720   auto C2 = B.buildFConstant(Ty, C2Val);
1721   auto Fabs = B.buildFAbs(Ty, Src);
1722 
1723   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1724   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1725   return true;
1726 }
1727 
1728 bool AMDGPULegalizerInfo::legalizeFceil(
1729   MachineInstr &MI, MachineRegisterInfo &MRI,
1730   MachineIRBuilder &B) const {
1731 
1732   const LLT S1 = LLT::scalar(1);
1733   const LLT S64 = LLT::scalar(64);
1734 
1735   Register Src = MI.getOperand(1).getReg();
1736   assert(MRI.getType(Src) == S64);
1737 
1738   // result = trunc(src)
1739   // if (src > 0.0 && src != result)
1740   //   result += 1.0
1741 
1742   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1743 
1744   const auto Zero = B.buildFConstant(S64, 0.0);
1745   const auto One = B.buildFConstant(S64, 1.0);
1746   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1747   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1748   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1749   auto Add = B.buildSelect(S64, And, One, Zero);
1750 
1751   // TODO: Should this propagate fast-math-flags?
1752   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1753   return true;
1754 }
1755 
1756 static MachineInstrBuilder extractF64Exponent(Register Hi,
1757                                               MachineIRBuilder &B) {
1758   const unsigned FractBits = 52;
1759   const unsigned ExpBits = 11;
1760   LLT S32 = LLT::scalar(32);
1761 
1762   auto Const0 = B.buildConstant(S32, FractBits - 32);
1763   auto Const1 = B.buildConstant(S32, ExpBits);
1764 
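  // The biased exponent of an f64 occupies bits [62:52], i.e. bits [30:20] of
  // the high 32-bit word: extract an 11-bit field at offset 20 and subtract
  // the IEEE-754 bias of 1023.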
1765   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1766     .addUse(Hi)
1767     .addUse(Const0.getReg(0))
1768     .addUse(Const1.getReg(0));
1769 
1770   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1771 }
1772 
1773 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1774   MachineInstr &MI, MachineRegisterInfo &MRI,
1775   MachineIRBuilder &B) const {
1776   const LLT S1 = LLT::scalar(1);
1777   const LLT S32 = LLT::scalar(32);
1778   const LLT S64 = LLT::scalar(64);
1779 
1780   Register Src = MI.getOperand(1).getReg();
1781   assert(MRI.getType(Src) == S64);
1782 
1783   // TODO: Should this use extract since the low half is unused?
1784   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1785   Register Hi = Unmerge.getReg(1);
1786 
1787   // Extract the upper half, since this is where we will find the sign and
1788   // exponent.
1789   auto Exp = extractF64Exponent(Hi, B);
1790 
1791   const unsigned FractBits = 52;
1792 
1793   // Extract the sign bit.
1794   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1795   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1796 
1797   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1798 
1799   const auto Zero32 = B.buildConstant(S32, 0);
1800 
1801   // Extend back to 64-bits.
1802   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1803 
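  // Shifting the 52-bit fraction mask right by the unbiased exponent leaves
  // exactly the bits that sit below the binary point, so AND-ing the source
  // with the complement truncates toward zero. An exponent below 0 yields
  // just the sign bit (+/-0.0); an exponent above 51 means the value is
  // already integral and the source is returned as-is.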
1804   auto Shr = B.buildAShr(S64, FractMask, Exp);
1805   auto Not = B.buildNot(S64, Shr);
1806   auto Tmp0 = B.buildAnd(S64, Src, Not);
1807   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1808 
1809   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1810   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1811 
1812   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1813   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1814   MI.eraseFromParent();
1815   return true;
1816 }
1817 
1818 bool AMDGPULegalizerInfo::legalizeITOFP(
1819   MachineInstr &MI, MachineRegisterInfo &MRI,
1820   MachineIRBuilder &B, bool Signed) const {
1821 
1822   Register Dst = MI.getOperand(0).getReg();
1823   Register Src = MI.getOperand(1).getReg();
1824 
1825   const LLT S64 = LLT::scalar(64);
1826   const LLT S32 = LLT::scalar(32);
1827 
1828   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1829 
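  // Convert the two 32-bit halves separately and recombine:
  //   fp(x) = ldexp(fp(x.hi), 32) + fp(x.lo)
  // Only the high half carries the sign, so only it differs between the
  // signed and unsigned conversions.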
1830   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1831 
1832   auto CvtHi = Signed ?
1833     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1834     B.buildUITOFP(S64, Unmerge.getReg(1));
1835 
1836   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1837 
1838   auto ThirtyTwo = B.buildConstant(S32, 32);
1839   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1840     .addUse(CvtHi.getReg(0))
1841     .addUse(ThirtyTwo.getReg(0));
1842 
1843   // TODO: Should this propagate fast-math-flags?
1844   B.buildFAdd(Dst, LdExp, CvtLo);
1845   MI.eraseFromParent();
1846   return true;
1847 }
1848 
1849 // TODO: Copied from DAG implementation. Verify logic and document how this
1850 // actually works.
1851 bool AMDGPULegalizerInfo::legalizeFPTOI(
1852   MachineInstr &MI, MachineRegisterInfo &MRI,
1853   MachineIRBuilder &B, bool Signed) const {
1854 
1855   Register Dst = MI.getOperand(0).getReg();
1856   Register Src = MI.getOperand(1).getReg();
1857 
1858   const LLT S64 = LLT::scalar(64);
1859   const LLT S32 = LLT::scalar(32);
1860 
1861   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1862 
1863   unsigned Flags = MI.getFlags();
1864 
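  // Split the conversion into 32-bit halves; K0 is 2^-32 and K1 is -2^32:
  //   hi = fptoi(floor(trunc(x) * 2^-32))
  //   lo = fptoui(fma(floor(trunc(x) * 2^-32), -2^32, trunc(x)))
  // and the result is G_MERGE_VALUES(lo, hi).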
1865   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1866   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1867   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1868 
1869   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1870   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1871   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1872 
1873   auto Hi = Signed ?
1874     B.buildFPTOSI(S32, FloorMul) :
1875     B.buildFPTOUI(S32, FloorMul);
1876   auto Lo = B.buildFPTOUI(S32, Fma);
1877 
1878   B.buildMerge(Dst, { Lo, Hi });
1879   MI.eraseFromParent();
1880 
1881   return true;
1882 }
1883 
1884 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1885                                                MachineInstr &MI) const {
1886   MachineFunction &MF = Helper.MIRBuilder.getMF();
1887   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1888 
1889   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1890                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1891 
1892   // With ieee_mode disabled, the instructions have the correct behavior
1893   // already for G_FMINNUM/G_FMAXNUM
1894   if (!MFI->getMode().IEEE)
1895     return !IsIEEEOp;
1896 
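  // In IEEE mode only the _IEEE variants match the hardware behavior as-is;
  // plain G_FMINNUM/G_FMAXNUM fall through to the generic lowering.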
1897   if (IsIEEEOp)
1898     return true;
1899 
1900   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1901 }
1902 
1903 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1904   MachineInstr &MI, MachineRegisterInfo &MRI,
1905   MachineIRBuilder &B) const {
1906   // TODO: Should move some of this into LegalizerHelper.
1907 
1908   // TODO: Promote dynamic indexing of s16 to s32
1909 
1910   // FIXME: Artifact combiner probably should have replaced the truncated
1911   // constant before this, so we shouldn't need
1912   // getConstantVRegValWithLookThrough.
1913   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1914     MI.getOperand(2).getReg(), MRI);
1915   if (!IdxVal) // Dynamic case will be selected to register indexing.
1916     return true;
1917 
1918   Register Dst = MI.getOperand(0).getReg();
1919   Register Vec = MI.getOperand(1).getReg();
1920 
1921   LLT VecTy = MRI.getType(Vec);
1922   LLT EltTy = VecTy.getElementType();
1923   assert(EltTy == MRI.getType(Dst));
1924 
1925   if (IdxVal->Value < VecTy.getNumElements())
1926     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1927   else
1928     B.buildUndef(Dst);
1929 
1930   MI.eraseFromParent();
1931   return true;
1932 }
1933 
1934 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1935   MachineInstr &MI, MachineRegisterInfo &MRI,
1936   MachineIRBuilder &B) const {
1937   // TODO: Should move some of this into LegalizerHelper.
1938 
1939   // TODO: Promote dynamic indexing of s16 to s32
1940 
1941   // FIXME: Artifact combiner probably should have replaced the truncated
1942   // constant before this, so we shouldn't need
1943   // getConstantVRegValWithLookThrough.
1944   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1945     MI.getOperand(3).getReg(), MRI);
1946   if (!IdxVal) // Dynamic case will be selected to register indexing.
1947     return true;
1948 
1949   Register Dst = MI.getOperand(0).getReg();
1950   Register Vec = MI.getOperand(1).getReg();
1951   Register Ins = MI.getOperand(2).getReg();
1952 
1953   LLT VecTy = MRI.getType(Vec);
1954   LLT EltTy = VecTy.getElementType();
1955   assert(EltTy == MRI.getType(Ins));
1956 
1957   if (IdxVal->Value < VecTy.getNumElements())
1958     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1959   else
1960     B.buildUndef(Dst);
1961 
1962   MI.eraseFromParent();
1963   return true;
1964 }
1965 
1966 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1967   MachineInstr &MI, MachineRegisterInfo &MRI,
1968   MachineIRBuilder &B) const {
1969   const LLT V2S16 = LLT::vector(2, 16);
1970 
1971   Register Dst = MI.getOperand(0).getReg();
1972   Register Src0 = MI.getOperand(1).getReg();
1973   LLT DstTy = MRI.getType(Dst);
1974   LLT SrcTy = MRI.getType(Src0);
1975 
1976   if (SrcTy == V2S16 && DstTy == V2S16 &&
1977       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1978     return true;
1979 
1980   MachineIRBuilder HelperBuilder(MI);
1981   GISelObserverWrapper DummyObserver;
1982   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1983   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1984 }
1985 
1986 bool AMDGPULegalizerInfo::legalizeSinCos(
1987   MachineInstr &MI, MachineRegisterInfo &MRI,
1988   MachineIRBuilder &B) const {
1989 
1990   Register DstReg = MI.getOperand(0).getReg();
1991   Register SrcReg = MI.getOperand(1).getReg();
1992   LLT Ty = MRI.getType(DstReg);
1993   unsigned Flags = MI.getFlags();
1994 
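  // The hardware sin/cos take an argument pre-scaled by 1/(2*pi). Subtargets
  // with the reduced trig input range additionally need the scaled value
  // wrapped into [0, 1) via the fract intrinsic.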
1995   Register TrigVal;
1996   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1997   if (ST.hasTrigReducedRange()) {
1998     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1999     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2000       .addUse(MulVal.getReg(0))
2001       .setMIFlags(Flags).getReg(0);
2002   } else
2003     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2004 
2005   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2006     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2007   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2008     .addUse(TrigVal)
2009     .setMIFlags(Flags);
2010   MI.eraseFromParent();
2011   return true;
2012 }
2013 
2014 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2015                                                   MachineIRBuilder &B,
2016                                                   const GlobalValue *GV,
2017                                                   int64_t Offset,
2018                                                   unsigned GAFlags) const {
2019   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2020   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2021   // to the following code sequence:
2022   //
2023   // For constant address space:
2024   //   s_getpc_b64 s[0:1]
2025   //   s_add_u32 s0, s0, $symbol
2026   //   s_addc_u32 s1, s1, 0
2027   //
2028   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2029   //   a fixup or relocation is emitted to replace $symbol with a literal
2030   //   constant, which is a pc-relative offset from the encoding of the $symbol
2031   //   operand to the global variable.
2032   //
2033   // For global address space:
2034   //   s_getpc_b64 s[0:1]
2035   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2036   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2037   //
2038   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2039   //   fixups or relocations are emitted to replace $symbol@*@lo and
2040   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2041   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2042   //   operand to the global variable.
2043   //
2044   // What we want here is an offset from the value returned by s_getpc
2045   // (which is the address of the s_add_u32 instruction) to the global
2046   // variable, but since the encoding of $symbol starts 4 bytes after the start
2047   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2048   // small. This requires us to add 4 to the global variable offset in order to
2049   // compute the correct address.
2050 
2051   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2052 
2053   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2054     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2055 
2056   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2057     .addDef(PCReg);
2058 
2059   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2060   if (GAFlags == SIInstrInfo::MO_NONE)
2061     MIB.addImm(0);
2062   else
2063     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2064 
2065   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2066 
2067   if (PtrTy.getSizeInBits() == 32)
2068     B.buildExtract(DstReg, PCReg, 0);
2069   return true;
2070 }
2071 
2072 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2073   MachineInstr &MI, MachineRegisterInfo &MRI,
2074   MachineIRBuilder &B) const {
2075   Register DstReg = MI.getOperand(0).getReg();
2076   LLT Ty = MRI.getType(DstReg);
2077   unsigned AS = Ty.getAddressSpace();
2078 
2079   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2080   MachineFunction &MF = B.getMF();
2081   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2082 
2083   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2084     if (!MFI->isEntryFunction()) {
2085       const Function &Fn = MF.getFunction();
2086       DiagnosticInfoUnsupported BadLDSDecl(
2087         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2088         DS_Warning);
2089       Fn.getContext().diagnose(BadLDSDecl);
2090 
2091       // We currently don't have a way to correctly allocate LDS objects that
2092       // aren't directly associated with a kernel. We do force inlining of
2093       // functions that use local objects. However, if these dead functions are
2094       // not eliminated, we don't want a compile time error. Just emit a warning
2095       // and a trap, since there should be no callable path here.
2096       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2097       B.buildUndef(DstReg);
2098       MI.eraseFromParent();
2099       return true;
2100     }
2101 
2102     // TODO: We could emit code to handle the initialization somewhere.
2103     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2104       const SITargetLowering *TLI = ST.getTargetLowering();
2105       if (!TLI->shouldUseLDSConstAddress(GV)) {
2106         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2107         return true; // Leave in place;
2108       }
2109 
2110       B.buildConstant(
2111           DstReg,
2112           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2113       MI.eraseFromParent();
2114       return true;
2115     }
2116 
2117     const Function &Fn = MF.getFunction();
2118     DiagnosticInfoUnsupported BadInit(
2119       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2120     Fn.getContext().diagnose(BadInit);
2121     return true;
2122   }
2123 
2124   const SITargetLowering *TLI = ST.getTargetLowering();
2125 
2126   if (TLI->shouldEmitFixup(GV)) {
2127     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2128     MI.eraseFromParent();
2129     return true;
2130   }
2131 
2132   if (TLI->shouldEmitPCReloc(GV)) {
2133     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2134     MI.eraseFromParent();
2135     return true;
2136   }
2137 
2138   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2139   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2140 
2141   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2142       MachinePointerInfo::getGOT(MF),
2143       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2144           MachineMemOperand::MOInvariant,
2145       8 /*Size*/, Align(8));
2146 
2147   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2148 
2149   if (Ty.getSizeInBits() == 32) {
2150     // Truncate if this is a 32-bit constant address.
2151     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2152     B.buildExtract(DstReg, Load, 0);
2153   } else
2154     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2155 
2156   MI.eraseFromParent();
2157   return true;
2158 }
2159 
2160 bool AMDGPULegalizerInfo::legalizeLoad(
2161   MachineInstr &MI, MachineRegisterInfo &MRI,
2162   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2163   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2164   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2165   Observer.changingInstr(MI);
2166   MI.getOperand(1).setReg(Cast.getReg(0));
2167   Observer.changedInstr(MI);
2168   return true;
2169 }
2170 
2171 bool AMDGPULegalizerInfo::legalizeFMad(
2172   MachineInstr &MI, MachineRegisterInfo &MRI,
2173   MachineIRBuilder &B) const {
2174   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2175   assert(Ty.isScalar());
2176 
2177   MachineFunction &MF = B.getMF();
2178   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2179 
2180   // TODO: Always legal with future ftz flag.
2181   // FIXME: Do we need just output?
2182   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2183     return true;
2184   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2185     return true;
2186 
2187   MachineIRBuilder HelperBuilder(MI);
2188   GISelObserverWrapper DummyObserver;
2189   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2190   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2191 }
2192 
2193 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2194   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2195   Register DstReg = MI.getOperand(0).getReg();
2196   Register PtrReg = MI.getOperand(1).getReg();
2197   Register CmpVal = MI.getOperand(2).getReg();
2198   Register NewVal = MI.getOperand(3).getReg();
2199 
2200   assert(SITargetLowering::isFlatGlobalAddrSpace(
2201            MRI.getType(PtrReg).getAddressSpace()) &&
2202          "this should not have been custom lowered");
2203 
2204   LLT ValTy = MRI.getType(CmpVal);
2205   LLT VecTy = LLT::vector(2, ValTy);
2206 
2207   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2208 
2209   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2210     .addDef(DstReg)
2211     .addUse(PtrReg)
2212     .addUse(PackedVal)
2213     .setMemRefs(MI.memoperands());
2214 
2215   MI.eraseFromParent();
2216   return true;
2217 }
2218 
2219 bool AMDGPULegalizerInfo::legalizeFlog(
2220   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2221   Register Dst = MI.getOperand(0).getReg();
2222   Register Src = MI.getOperand(1).getReg();
2223   LLT Ty = B.getMRI()->getType(Dst);
2224   unsigned Flags = MI.getFlags();
2225 
2226   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2227   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2228 
2229   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2230   MI.eraseFromParent();
2231   return true;
2232 }
2233 
2234 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2235                                        MachineIRBuilder &B) const {
2236   Register Dst = MI.getOperand(0).getReg();
2237   Register Src = MI.getOperand(1).getReg();
2238   unsigned Flags = MI.getFlags();
2239   LLT Ty = B.getMRI()->getType(Dst);
2240 
2241   auto K = B.buildFConstant(Ty, numbers::log2e);
2242   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2243   B.buildFExp2(Dst, Mul, Flags);
2244   MI.eraseFromParent();
2245   return true;
2246 }
2247 
2248 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2249                                        MachineIRBuilder &B) const {
2250   Register Dst = MI.getOperand(0).getReg();
2251   Register Src0 = MI.getOperand(1).getReg();
2252   Register Src1 = MI.getOperand(2).getReg();
2253   unsigned Flags = MI.getFlags();
2254   LLT Ty = B.getMRI()->getType(Dst);
2255   const LLT S16 = LLT::scalar(16);
2256   const LLT S32 = LLT::scalar(32);
2257 
2258   if (Ty == S32) {
2259     auto Log = B.buildFLog2(S32, Src0, Flags);
2260     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2261       .addUse(Log.getReg(0))
2262       .addUse(Src1)
2263       .setMIFlags(Flags);
2264     B.buildFExp2(Dst, Mul, Flags);
2265   } else if (Ty == S16) {
2266     // There's no f16 fmul_legacy, so we need to convert for it.
2267     auto Log = B.buildFLog2(S16, Src0, Flags);
2268     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2269     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2270     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2271       .addUse(Ext0.getReg(0))
2272       .addUse(Ext1.getReg(0))
2273       .setMIFlags(Flags);
2274 
2275     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2276   } else
2277     return false;
2278 
2279   MI.eraseFromParent();
2280   return true;
2281 }
2282 
2283 // Find a source register, ignoring any possible source modifiers.
2284 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2285   Register ModSrc = OrigSrc;
2286   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2287     ModSrc = SrcFNeg->getOperand(1).getReg();
2288     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2289       ModSrc = SrcFAbs->getOperand(1).getReg();
2290   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2291     ModSrc = SrcFAbs->getOperand(1).getReg();
2292   return ModSrc;
2293 }
2294 
2295 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2296                                          MachineRegisterInfo &MRI,
2297                                          MachineIRBuilder &B) const {
2298 
2299   const LLT S1 = LLT::scalar(1);
2300   const LLT S64 = LLT::scalar(64);
2301   Register Dst = MI.getOperand(0).getReg();
2302   Register OrigSrc = MI.getOperand(1).getReg();
2303   unsigned Flags = MI.getFlags();
2304   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2305          "this should not have been custom lowered");
2306 
2307   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2308   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2309   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2310   // V_FRACT bug is:
2311   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2312   //
2313   // Convert floor(x) to (x - fract(x))
2314 
2315   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2316     .addUse(OrigSrc)
2317     .setMIFlags(Flags);
2318 
2319   // Give source modifier matching some assistance before obscuring a foldable
2320   // pattern.
2321 
2322   // TODO: We can avoid the neg on the fract? The input sign to fract
2323   // shouldn't matter?
2324   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2325 
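  // 0x3fefffffffffffff is the largest f64 strictly below 1.0, i.e. the
  // 0.99999999999999999 clamp value from the workaround described above.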
2326   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2327 
2328   Register Min = MRI.createGenericVirtualRegister(S64);
2329 
2330   // We don't need to concern ourselves with the snan handling difference, so
2331   // use the one which will directly select.
2332   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2333   if (MFI->getMode().IEEE)
2334     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2335   else
2336     B.buildFMinNum(Min, Fract, Const, Flags);
2337 
2338   Register CorrectedFract = Min;
2339   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2340     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2341     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2342   }
2343 
2344   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2345   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2346 
2347   MI.eraseFromParent();
2348   return true;
2349 }
2350 
2351 // Turn an illegal packed v2s16 build vector into bit operations.
2352 // TODO: This should probably be a bitcast action in LegalizerHelper.
2353 bool AMDGPULegalizerInfo::legalizeBuildVector(
2354   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2355   Register Dst = MI.getOperand(0).getReg();
2356   const LLT S32 = LLT::scalar(32);
2357   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2358 
2359   Register Src0 = MI.getOperand(1).getReg();
2360   Register Src1 = MI.getOperand(2).getReg();
2361   assert(MRI.getType(Src0) == LLT::scalar(16));
2362 
2363   auto Merge = B.buildMerge(S32, {Src0, Src1});
2364   B.buildBitcast(Dst, Merge);
2365 
2366   MI.eraseFromParent();
2367   return true;
2368 }
2369 
2370 // Return the use branch instruction, otherwise null if the usage is invalid.
2371 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2372                                        MachineRegisterInfo &MRI,
2373                                        MachineInstr *&Br,
2374                                        MachineBasicBlock *&UncondBrTarget) {
2375   Register CondDef = MI.getOperand(0).getReg();
2376   if (!MRI.hasOneNonDBGUse(CondDef))
2377     return nullptr;
2378 
2379   MachineBasicBlock *Parent = MI.getParent();
2380   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2381   if (UseMI.getParent() != Parent ||
2382       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2383     return nullptr;
2384 
2385   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2386   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2387   if (Next == Parent->end()) {
2388     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2389     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2390       return nullptr;
2391     UncondBrTarget = &*NextMBB;
2392   } else {
2393     if (Next->getOpcode() != AMDGPU::G_BR)
2394       return nullptr;
2395     Br = &*Next;
2396     UncondBrTarget = Br->getOperand(0).getMBB();
2397   }
2398 
2399   return &UseMI;
2400 }
2401 
2402 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2403                                                MachineRegisterInfo &MRI,
2404                                                Register LiveIn,
2405                                                Register PhyReg) const {
2406   assert(PhyReg.isPhysical() && "Physical register expected");
2407 
2408   // Insert the live-in copy, if required, by defining the destination
2409   // virtual register.
2410   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2411   if (!MRI.getVRegDef(LiveIn)) {
2412     // FIXME: Should have scoped insert pt
2413     MachineBasicBlock &OrigInsBB = B.getMBB();
2414     auto OrigInsPt = B.getInsertPt();
2415 
2416     MachineBasicBlock &EntryMBB = B.getMF().front();
2417     EntryMBB.addLiveIn(PhyReg);
2418     B.setInsertPt(EntryMBB, EntryMBB.begin());
2419     B.buildCopy(LiveIn, PhyReg);
2420 
2421     B.setInsertPt(OrigInsBB, OrigInsPt);
2422   }
2423 
2424   return LiveIn;
2425 }
2426 
2427 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2428                                                 MachineRegisterInfo &MRI,
2429                                                 Register PhyReg, LLT Ty,
2430                                                 bool InsertLiveInCopy) const {
2431   assert(PhyReg.isPhysical() && "Physical register expected");
2432 
2433   // Get or create the virtual live-in register.
2434   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2435   if (!LiveIn) {
2436     LiveIn = MRI.createGenericVirtualRegister(Ty);
2437     MRI.addLiveIn(PhyReg, LiveIn);
2438   }
2439 
2440   // When the actual copy required is from a virtual register to a physical
2441   // register (to be inserted later), inserting the live-in copy from the
2442   // physical register to the virtual register is not required.
2443   if (!InsertLiveInCopy)
2444     return LiveIn;
2445 
2446   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2447 }
2448 
2449 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2450     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2451   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2452   const ArgDescriptor *Arg;
2453   const TargetRegisterClass *RC;
2454   LLT ArgTy;
2455   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2456   if (!Arg) {
2457     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2458     return nullptr;
2459   }
2460   return Arg;
2461 }
2462 
2463 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2464                                          const ArgDescriptor *Arg) const {
2465   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2466     return false; // TODO: Handle these
2467 
2468   Register SrcReg = Arg->getRegister();
2469   assert(SrcReg.isPhysical() && "Physical register expected");
2470   assert(DstReg.isVirtual() && "Virtual register expected");
2471 
2472   MachineRegisterInfo &MRI = *B.getMRI();
2473 
2474   LLT Ty = MRI.getType(DstReg);
2475   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2476 
2477   if (Arg->isMasked()) {
2478     // TODO: Should we try to emit this once in the entry block?
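    // A masked argument shares its register with other values (e.g. the
    // packed work-item IDs), so shift the field down to bit 0 and mask off
    // the rest.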
2479     const LLT S32 = LLT::scalar(32);
2480     const unsigned Mask = Arg->getMask();
2481     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2482 
2483     Register AndMaskSrc = LiveIn;
2484 
2485     if (Shift != 0) {
2486       auto ShiftAmt = B.buildConstant(S32, Shift);
2487       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2488     }
2489 
2490     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2491   } else {
2492     B.buildCopy(DstReg, LiveIn);
2493   }
2494 
2495   return true;
2496 }
2497 
2498 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2499     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2500     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2501 
2502   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2503   if (!Arg)
2504     return false;
2505 
2506   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2507     return false;
2508 
2509   MI.eraseFromParent();
2510   return true;
2511 }
2512 
2513 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2514                                        MachineRegisterInfo &MRI,
2515                                        MachineIRBuilder &B) const {
2516   Register Dst = MI.getOperand(0).getReg();
2517   LLT DstTy = MRI.getType(Dst);
2518   LLT S16 = LLT::scalar(16);
2519   LLT S32 = LLT::scalar(32);
2520   LLT S64 = LLT::scalar(64);
2521 
2522   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2523     return true;
2524 
2525   if (DstTy == S16)
2526     return legalizeFDIV16(MI, MRI, B);
2527   if (DstTy == S32)
2528     return legalizeFDIV32(MI, MRI, B);
2529   if (DstTy == S64)
2530     return legalizeFDIV64(MI, MRI, B);
2531 
2532   return false;
2533 }
2534 
2535 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2536                                                   Register DstReg,
2537                                                   Register X,
2538                                                   Register Y,
2539                                                   bool IsDiv) const {
2540   const LLT S1 = LLT::scalar(1);
2541   const LLT S32 = LLT::scalar(32);
2542 
2543   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2544   // algorithm used here.
2545 
2546   // Initial estimate of inv(y).
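  // Z ~= 2^32 / Y: take the hardware reciprocal of Y as a float and scale it
  // by 0x4f7ffffe, a constant just below 2^32, before converting back to an
  // integer.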
2547   auto FloatY = B.buildUITOFP(S32, Y);
2548   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2549   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2550   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2551   auto Z = B.buildFPTOUI(S32, ScaledY);
2552 
2553   // One round of UNR (unsigned Newton-Raphson) refinement.
2554   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2555   auto NegYZ = B.buildMul(S32, NegY, Z);
2556   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2557 
2558   // Quotient/remainder estimate.
2559   auto Q = B.buildUMulH(S32, X, Z);
2560   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2561 
2562   // First quotient/remainder refinement.
2563   auto One = B.buildConstant(S32, 1);
2564   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2565   if (IsDiv)
2566     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2567   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2568 
2569   // Second quotient/remainder refinement.
2570   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2571   if (IsDiv)
2572     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2573   else
2574     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2575 }
2576 
2577 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2578                                               MachineRegisterInfo &MRI,
2579                                               MachineIRBuilder &B) const {
2580   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2581   Register DstReg = MI.getOperand(0).getReg();
2582   Register Num = MI.getOperand(1).getReg();
2583   Register Den = MI.getOperand(2).getReg();
2584   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2585   MI.eraseFromParent();
2586   return true;
2587 }
2588 
2589 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2590 //
2591 // Return lo, hi of result
2592 //
2593 // %cvt.lo = G_UITOFP Val.lo
2594 // %cvt.hi = G_UITOFP Val.hi
2595 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2596 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2597 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2598 // %mul2 = G_FMUL %mul1, 2**(-32)
2599 // %trunc = G_INTRINSIC_TRUNC %mul2
2600 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2601 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2602 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2603                                                        Register Val) {
2604   const LLT S32 = LLT::scalar(32);
2605   auto Unmerge = B.buildUnmerge(S32, Val);
2606 
2607   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2608   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2609 
2610   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2611                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2612 
2613   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2614   auto Mul1 =
2615       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2616 
2617   // 2**(-32)
2618   auto Mul2 =
2619       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2620   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2621 
2622   // -(2**32)
2623   auto Mad2 = B.buildFMAD(S32, Trunc,
2624                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2625 
2626   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2627   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2628 
2629   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2630 }
2631 
2632 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2633                                                   Register DstReg,
2634                                                   Register Numer,
2635                                                   Register Denom,
2636                                                   bool IsDiv) const {
2637   const LLT S32 = LLT::scalar(32);
2638   const LLT S64 = LLT::scalar(64);
2639   const LLT S1 = LLT::scalar(1);
2640   Register RcpLo, RcpHi;
2641 
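  // 64-bit version of the scheme above: start from an approximate
  // 2^64 / Denom reciprocal, refine it with two Newton-Raphson style steps
  // carried out in 32-bit pieces, take the quotient estimate
  // umulh(Numer, Rcp), and then apply up to two +1 / -Denom corrections using
  // selects instead of real control flow (see the TODO further down).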
2642   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2643 
2644   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2645 
2646   auto Zero64 = B.buildConstant(S64, 0);
2647   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2648 
2649   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2650   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2651 
2652   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2653   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2654   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2655 
2656   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2657   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2658   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2659   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2660 
2661   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2662   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2663   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2664   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2665   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2666 
2667   auto Zero32 = B.buildConstant(S32, 0);
2668   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2669   auto Add2_HiC =
2670       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2671   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2672   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2673 
2674   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2675   Register NumerLo = UnmergeNumer.getReg(0);
2676   Register NumerHi = UnmergeNumer.getReg(1);
2677 
2678   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2679   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2680   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2681   Register Mul3_Lo = UnmergeMul3.getReg(0);
2682   Register Mul3_Hi = UnmergeMul3.getReg(1);
2683   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2684   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2685   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2686   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2687 
2688   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2689   Register DenomLo = UnmergeDenom.getReg(0);
2690   Register DenomHi = UnmergeDenom.getReg(1);
2691 
2692   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2693   auto C1 = B.buildSExt(S32, CmpHi);
2694 
2695   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2696   auto C2 = B.buildSExt(S32, CmpLo);
2697 
2698   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2699   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2700 
2701   // TODO: Here and below, portions of the code could be enclosed in if/endif
2702   // blocks. Currently the control flow is unconditional and we have 4 selects
2703   // after the potential endif to substitute for PHIs.
2704 
2705   // if C3 != 0 ...
2706   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2707   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2708   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2709   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2710 
2711   auto One64 = B.buildConstant(S64, 1);
2712   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2713 
2714   auto C4 =
2715       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2716   auto C5 =
2717       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2718   auto C6 = B.buildSelect(
2719       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2720 
2721   // if (C6 != 0)
2722   auto Add4 = B.buildAdd(S64, Add3, One64);
2723   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2724 
2725   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2726   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2727   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2728 
2729   // endif C6
2730   // endif C3
2731 
2732   if (IsDiv) {
2733     auto Sel1 = B.buildSelect(
2734         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2735     B.buildSelect(DstReg,
2736                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2737   } else {
2738     auto Sel2 = B.buildSelect(
2739         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2740     B.buildSelect(DstReg,
2741                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2742   }
2743 }
2744 
2745 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2746                                             MachineRegisterInfo &MRI,
2747                                             MachineIRBuilder &B) const {
2748   const LLT S64 = LLT::scalar(64);
2749   const LLT S32 = LLT::scalar(32);
2750   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2751   Register DstReg = MI.getOperand(0).getReg();
2752   Register Num = MI.getOperand(1).getReg();
2753   Register Den = MI.getOperand(2).getReg();
2754   LLT Ty = MRI.getType(DstReg);
2755 
2756   if (Ty == S32)
2757     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2758   else if (Ty == S64)
2759     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2760   else
2761     return false;
2762 
2763   MI.eraseFromParent();
2764   return true;
2765 
2766 }
2767 
2768 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2769                                             MachineRegisterInfo &MRI,
2770                                             MachineIRBuilder &B) const {
2771   const LLT S64 = LLT::scalar(64);
2772   const LLT S32 = LLT::scalar(32);
2773 
2774   Register DstReg = MI.getOperand(0).getReg();
2775   const LLT Ty = MRI.getType(DstReg);
2776   if (Ty != S32 && Ty != S64)
2777     return false;
2778 
2779   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2780 
2781   Register LHS = MI.getOperand(1).getReg();
2782   Register RHS = MI.getOperand(2).getReg();
2783 
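  // Reduce to the unsigned case. With sign = x >> (bits - 1) (0 or -1), the
  // identity (x + sign) ^ sign gives |x|; applying the same xor/subtract to
  // the unsigned result restores the sign: the quotient takes
  // sign(LHS) ^ sign(RHS), the remainder takes sign(LHS).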
2784   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2785   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2786   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2787 
2788   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2789   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2790 
2791   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2792   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2793 
2794   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2795   if (Ty == S32)
2796     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2797   else
2798     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2799 
2800   Register Sign;
2801   if (IsDiv)
2802     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2803   else
2804     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2805 
2806   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2807   B.buildSub(DstReg, UDivRem, Sign);
2808 
2809   MI.eraseFromParent();
2810   return true;
2811 }
2812 
2813 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2814                                                  MachineRegisterInfo &MRI,
2815                                                  MachineIRBuilder &B) const {
2816   Register Res = MI.getOperand(0).getReg();
2817   Register LHS = MI.getOperand(1).getReg();
2818   Register RHS = MI.getOperand(2).getReg();
2819 
2820   uint16_t Flags = MI.getFlags();
2821 
2822   LLT ResTy = MRI.getType(Res);
2823   LLT S32 = LLT::scalar(32);
2824   LLT S64 = LLT::scalar(64);
2825 
2826   const MachineFunction &MF = B.getMF();
2827   bool Unsafe =
2828     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2829 
2830   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2831     return false;
2832 
2833   if (!Unsafe && ResTy == S32 &&
2834       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2835     return false;
2836 
2837   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2838     // 1 / x -> RCP(x)
2839     if (CLHS->isExactlyValue(1.0)) {
2840       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2841         .addUse(RHS)
2842         .setMIFlags(Flags);
2843 
2844       MI.eraseFromParent();
2845       return true;
2846     }
2847 
2848     // -1 / x -> RCP( FNEG(x) )
2849     if (CLHS->isExactlyValue(-1.0)) {
2850       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2851       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2852         .addUse(FNeg.getReg(0))
2853         .setMIFlags(Flags);
2854 
2855       MI.eraseFromParent();
2856       return true;
2857     }
2858   }
2859 
2860   // x / y -> x * (1.0 / y)
2861   if (Unsafe) {
2862     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2863       .addUse(RHS)
2864       .setMIFlags(Flags);
2865     B.buildFMul(Res, LHS, RCP, Flags);
2866 
2867     MI.eraseFromParent();
2868     return true;
2869   }
2870 
2871   return false;
2872 }
2873 
2874 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2875                                          MachineRegisterInfo &MRI,
2876                                          MachineIRBuilder &B) const {
2877   Register Res = MI.getOperand(0).getReg();
2878   Register LHS = MI.getOperand(1).getReg();
2879   Register RHS = MI.getOperand(2).getReg();
2880 
2881   uint16_t Flags = MI.getFlags();
2882 
2883   LLT S16 = LLT::scalar(16);
2884   LLT S32 = LLT::scalar(32);
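  // The f16 division is carried out in f32: extend both operands, multiply the
  // extended numerator by an f32 reciprocal of the denominator, truncate back
  // to f16, and let div_fixup patch up special cases such as infinities and
  // NaNs. Roughly: trunc(fpext(a) * rcp(fpext(b))).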
2885 
2886   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2887   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2888 
2889   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2890     .addUse(RHSExt.getReg(0))
2891     .setMIFlags(Flags);
2892 
2893   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2894   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2895 
2896   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2897     .addUse(RDst.getReg(0))
2898     .addUse(RHS)
2899     .addUse(LHS)
2900     .setMIFlags(Flags);
2901 
2902   MI.eraseFromParent();
2903   return true;
2904 }
2905 
2906 // Enable or disable FP32 denormal mode. When 'Enable' is true, emit
2907 // instructions that enable denormals; otherwise restore the default FP32 mode.
2908 static void toggleSPDenormMode(bool Enable,
2909                                MachineIRBuilder &B,
2910                                const GCNSubtarget &ST,
2911                                AMDGPU::SIModeRegisterDefaults Mode) {
2912   // Set SP denorm mode to this value.
2913   unsigned SPDenormMode =
2914     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2915 
2916   if (ST.hasDenormModeInst()) {
2917     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2918     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2919 
2920     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
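    // The S_DENORM_MODE immediate packs the FP32 field in bits [1:0] and the
    // FP64/FP16 field in bits [3:2], hence the shift by 2 above. For example,
    // enabling FP32 denormals while keeping the default DP mode yields an
    // immediate of (DPDenormModeDefault << 2) | FP_DENORM_FLUSH_NONE.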
2921     B.buildInstr(AMDGPU::S_DENORM_MODE)
2922       .addImm(NewDenormModeValue);
2923 
2924   } else {
2925     // Select FP32 bit field in mode register.
2926     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2927                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2928                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2929 
2930     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2931       .addImm(SPDenormMode)
2932       .addImm(SPDenormModeBitField);
2933   }
2934 }
2935 
2936 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2937                                          MachineRegisterInfo &MRI,
2938                                          MachineIRBuilder &B) const {
2939   Register Res = MI.getOperand(0).getReg();
2940   Register LHS = MI.getOperand(1).getReg();
2941   Register RHS = MI.getOperand(2).getReg();
2942   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2943   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2944 
2945   uint16_t Flags = MI.getFlags();
2946 
2947   LLT S32 = LLT::scalar(32);
2948   LLT S1 = LLT::scalar(1);
2949 
2950   auto One = B.buildFConstant(S32, 1.0f);
2951 
2952   auto DenominatorScaled =
2953     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2954       .addUse(LHS)
2955       .addUse(RHS)
2956       .addImm(0)
2957       .setMIFlags(Flags);
2958   auto NumeratorScaled =
2959     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2960       .addUse(LHS)
2961       .addUse(RHS)
2962       .addImm(1)
2963       .setMIFlags(Flags);
2964 
2965   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2966     .addUse(DenominatorScaled.getReg(0))
2967     .setMIFlags(Flags);
2968   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2969 
2970   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2971   // aren't modeled as reading it.
2972   if (!Mode.allFP32Denormals())
2973     toggleSPDenormMode(true, B, ST, Mode);
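  // The FMA chain below is a Newton-Raphson refinement of the scaled
  // reciprocal and quotient, informally:
  //   r1 = r0 + r0 * (1 - d * r0)   // Fma0/Fma1 refine the rcp estimate r0
  //   q  = n * r1                   // Mul
  //   q' = q + r1 * (n - d * q)     // Fma2/Fma3 refine the quotient
  // Fma4 recomputes the final residual for div_fmas, which applies the scale
  // correction using the flag produced by div_scale. This is an informal
  // sketch of the math, not extra emitted MIR.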
2974 
2975   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2976   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2977   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2978   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2979   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2980   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2981 
2982   if (!Mode.allFP32Denormals())
2983     toggleSPDenormMode(false, B, ST, Mode);
2984 
2985   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2986     .addUse(Fma4.getReg(0))
2987     .addUse(Fma1.getReg(0))
2988     .addUse(Fma3.getReg(0))
2989     .addUse(NumeratorScaled.getReg(1))
2990     .setMIFlags(Flags);
2991 
2992   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2993     .addUse(Fmas.getReg(0))
2994     .addUse(RHS)
2995     .addUse(LHS)
2996     .setMIFlags(Flags);
2997 
2998   MI.eraseFromParent();
2999   return true;
3000 }
3001 
3002 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3003                                          MachineRegisterInfo &MRI,
3004                                          MachineIRBuilder &B) const {
3005   Register Res = MI.getOperand(0).getReg();
3006   Register LHS = MI.getOperand(1).getReg();
3007   Register RHS = MI.getOperand(2).getReg();
3008 
3009   uint16_t Flags = MI.getFlags();
3010 
3011   LLT S64 = LLT::scalar(64);
3012   LLT S1 = LLT::scalar(1);
3013 
3014   auto One = B.buildFConstant(S64, 1.0);
3015 
3016   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3017     .addUse(LHS)
3018     .addUse(RHS)
3019     .addImm(0)
3020     .setMIFlags(Flags);
3021 
3022   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3023 
3024   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3025     .addUse(DivScale0.getReg(0))
3026     .setMIFlags(Flags);
3027 
3028   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3029   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3030   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
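  // Double precision needs more refinement than f32: Fma0/Fma1 perform one
  // Newton-Raphson step on the reciprocal estimate (r1 = r0 + r0 * (1 - d * r0)),
  // and Fma2/Fma3 perform a second step before the scaled quotient is formed
  // below. Informal sketch only; the emitted MIR is exactly the FMAs shown here.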
3031 
3032   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3033     .addUse(LHS)
3034     .addUse(RHS)
3035     .addImm(1)
3036     .setMIFlags(Flags);
3037 
3038   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3039   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3040   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3041 
3042   Register Scale;
3043   if (!ST.hasUsableDivScaleConditionOutput()) {
3044     // Work around a hardware bug on SI where the condition output from div_scale
3045     // is not usable.
3046 
3047     LLT S32 = LLT::scalar(32);
3048 
3049     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3050     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3051     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3052     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3053 
3054     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3055                               Scale1Unmerge.getReg(1));
3056     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3057                               Scale0Unmerge.getReg(1));
3058     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3059   } else {
3060     Scale = DivScale1.getReg(1);
3061   }
3062 
3063   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3064     .addUse(Fma4.getReg(0))
3065     .addUse(Fma3.getReg(0))
3066     .addUse(Mul.getReg(0))
3067     .addUse(Scale)
3068     .setMIFlags(Flags);
3069 
3070   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3071     .addUse(Fmas.getReg(0))
3072     .addUse(RHS)
3073     .addUse(LHS)
3074     .setMIFlags(Flags);
3075 
3076   MI.eraseFromParent();
3077   return true;
3078 }
3079 
3080 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3081                                                  MachineRegisterInfo &MRI,
3082                                                  MachineIRBuilder &B) const {
3083   Register Res = MI.getOperand(0).getReg();
3084   Register LHS = MI.getOperand(2).getReg();
3085   Register RHS = MI.getOperand(3).getReg();
3086   uint16_t Flags = MI.getFlags();
3087 
3088   LLT S32 = LLT::scalar(32);
3089   LLT S1 = LLT::scalar(1);
3090 
3091   auto Abs = B.buildFAbs(S32, RHS, Flags);
3092   const APFloat C0Val(1.0f);
3093 
3094   auto C0 = B.buildConstant(S32, 0x6f800000);
3095   auto C1 = B.buildConstant(S32, 0x2f800000);
3096   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
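  // The magic constants above are IEEE-754 single bit patterns: 0x6f800000 is
  // 2^96 and 0x2f800000 is 2^-32. If |RHS| exceeds 2^96, the denominator is
  // pre-scaled by 2^-32 so its reciprocal does not underflow, and the same
  // scale factor is re-applied to the quotient at the end.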
3097 
3098   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3099   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3100 
3101   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3102 
3103   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3104     .addUse(Mul0.getReg(0))
3105     .setMIFlags(Flags);
3106 
3107   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3108 
3109   B.buildFMul(Res, Sel, Mul1, Flags);
3110 
3111   MI.eraseFromParent();
3112   return true;
3113 }
3114 
3115 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3116                                             MachineRegisterInfo &MRI,
3117                                             MachineIRBuilder &B) const {
3118   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3119   uint64_t Offset =
3120     ST.getTargetLowering()->getImplicitParameterOffset(
3121       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3122   LLT DstTy = MRI.getType(DstReg);
3123   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3124 
3125   const ArgDescriptor *Arg;
3126   const TargetRegisterClass *RC;
3127   LLT ArgTy;
3128   std::tie(Arg, RC, ArgTy) =
3129       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3130   if (!Arg)
3131     return false;
3132 
3133   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3134   if (!loadInputValue(KernargPtrReg, B, Arg))
3135     return false;
3136 
3137   // FIXME: This should be nuw
3138   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3139   return true;
3140 }
3141 
3142 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3143                                                  MachineRegisterInfo &MRI,
3144                                                  MachineIRBuilder &B) const {
3145   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3146   if (!MFI->isEntryFunction()) {
3147     return legalizePreloadedArgIntrin(MI, MRI, B,
3148                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3149   }
3150 
3151   Register DstReg = MI.getOperand(0).getReg();
3152   if (!getImplicitArgPtr(DstReg, MRI, B))
3153     return false;
3154 
3155   MI.eraseFromParent();
3156   return true;
3157 }
3158 
3159 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3160                                               MachineRegisterInfo &MRI,
3161                                               MachineIRBuilder &B,
3162                                               unsigned AddrSpace) const {
3163   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3164   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3165   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3166   MI.eraseFromParent();
3167   return true;
3168 }
3169 
3170 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3171 // offset (the offset that is included in bounds checking and swizzling, to be
3172 // split between the instruction's voffset and immoffset fields) and soffset
3173 // (the offset that is excluded from bounds checking and swizzling, to go in
3174 // the instruction's soffset field).  This function takes the first kind of
3175 // offset and figures out how to split it between voffset and immoffset.
3176 std::tuple<Register, unsigned, unsigned>
3177 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3178                                         Register OrigOffset) const {
3179   const unsigned MaxImm = 4095;
3180   Register BaseReg;
3181   unsigned TotalConstOffset;
3182   MachineInstr *OffsetDef;
3183   const LLT S32 = LLT::scalar(32);
3184 
3185   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3186     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3187 
3188   unsigned ImmOffset = TotalConstOffset;
3189 
3190   // If the immediate value is too big for the immoffset field, put the value
3191   // and -4096 into the immoffset field so that the value that is copied/added
3192   // for the voffset field is a multiple of 4096, and it stands more chance
3193   // of being CSEd with the copy/add for another similar load/store.
3194   // However, do not do that rounding down to a multiple of 4096 if that is a
3195   // negative number, as it appears to be illegal to have a negative offset
3196   // in the vgpr, even if adding the immediate offset makes it positive.
3197   unsigned Overflow = ImmOffset & ~MaxImm;
3198   ImmOffset -= Overflow;
3199   if ((int32_t)Overflow < 0) {
3200     Overflow += ImmOffset;
3201     ImmOffset = 0;
3202   }
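  // For example, a constant offset of 4100 splits into Overflow = 4096 (folded
  // into the voffset register below) and ImmOffset = 4, so the register part
  // stays 4096-aligned and is more likely to be CSEd across similar accesses.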
3203 
3204   if (Overflow != 0) {
3205     if (!BaseReg) {
3206       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3207     } else {
3208       auto OverflowVal = B.buildConstant(S32, Overflow);
3209       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3210     }
3211   }
3212 
3213   if (!BaseReg)
3214     BaseReg = B.buildConstant(S32, 0).getReg(0);
3215 
3216   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3217 }
3218 
3219 /// Handle register layout difference for f16 images for some subtargets.
3220 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3221                                              MachineRegisterInfo &MRI,
3222                                              Register Reg) const {
3223   if (!ST.hasUnpackedD16VMem())
3224     return Reg;
3225 
3226   const LLT S16 = LLT::scalar(16);
3227   const LLT S32 = LLT::scalar(32);
3228   LLT StoreVT = MRI.getType(Reg);
3229   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
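  // On unpacked-D16 subtargets every 16-bit component occupies its own 32-bit
  // register, so e.g. a <4 x s16> source is rewritten to <4 x s32> with each
  // element any-extended below.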
3230 
3231   auto Unmerge = B.buildUnmerge(S16, Reg);
3232 
3233   SmallVector<Register, 4> WideRegs;
3234   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3235     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3236 
3237   int NumElts = StoreVT.getNumElements();
3238 
3239   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3240 }
3241 
3242 Register AMDGPULegalizerInfo::fixStoreSourceType(
3243   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3244   MachineRegisterInfo *MRI = B.getMRI();
3245   LLT Ty = MRI->getType(VData);
3246 
3247   const LLT S16 = LLT::scalar(16);
3248 
3249   // Fix up illegal register types for i8 and i16 stores.
3250   if (Ty == LLT::scalar(8) || Ty == S16) {
3251     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3252     return AnyExt;
3253   }
3254 
3255   if (Ty.isVector()) {
3256     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3257       if (IsFormat)
3258         return handleD16VData(B, *MRI, VData);
3259     }
3260   }
3261 
3262   return VData;
3263 }
3264 
3265 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3266                                               MachineRegisterInfo &MRI,
3267                                               MachineIRBuilder &B,
3268                                               bool IsTyped,
3269                                               bool IsFormat) const {
3270   Register VData = MI.getOperand(1).getReg();
3271   LLT Ty = MRI.getType(VData);
3272   LLT EltTy = Ty.getScalarType();
3273   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3274   const LLT S32 = LLT::scalar(32);
3275 
3276   VData = fixStoreSourceType(B, VData, IsFormat);
3277   Register RSrc = MI.getOperand(2).getReg();
3278 
3279   MachineMemOperand *MMO = *MI.memoperands_begin();
3280   const int MemSize = MMO->getSize();
3281 
3282   unsigned ImmOffset;
3283   unsigned TotalOffset;
3284 
3285   // The typed intrinsics add an immediate after the registers.
3286   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3287 
3288   // The struct intrinsic variants add one additional operand over raw.
3289   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3290   Register VIndex;
3291   int OpOffset = 0;
3292   if (HasVIndex) {
3293     VIndex = MI.getOperand(3).getReg();
3294     OpOffset = 1;
3295   }
3296 
3297   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3298   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3299 
3300   unsigned Format = 0;
3301   if (IsTyped) {
3302     Format = MI.getOperand(5 + OpOffset).getImm();
3303     ++OpOffset;
3304   }
3305 
3306   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3307 
3308   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3309   if (TotalOffset != 0)
3310     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3311 
3312   unsigned Opc;
3313   if (IsTyped) {
3314     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3315                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3316   } else if (IsFormat) {
3317     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3318                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3319   } else {
3320     switch (MemSize) {
3321     case 1:
3322       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3323       break;
3324     case 2:
3325       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3326       break;
3327     default:
3328       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3329       break;
3330     }
3331   }
3332 
3333   if (!VIndex)
3334     VIndex = B.buildConstant(S32, 0).getReg(0);
3335 
3336   auto MIB = B.buildInstr(Opc)
3337     .addUse(VData)              // vdata
3338     .addUse(RSrc)               // rsrc
3339     .addUse(VIndex)             // vindex
3340     .addUse(VOffset)            // voffset
3341     .addUse(SOffset)            // soffset
3342     .addImm(ImmOffset);         // offset(imm)
3343 
3344   if (IsTyped)
3345     MIB.addImm(Format);
3346 
3347   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3348      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3349      .addMemOperand(MMO);
3350 
3351   MI.eraseFromParent();
3352   return true;
3353 }
3354 
3355 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3356                                              MachineRegisterInfo &MRI,
3357                                              MachineIRBuilder &B,
3358                                              bool IsFormat,
3359                                              bool IsTyped) const {
3360   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3361   MachineMemOperand *MMO = *MI.memoperands_begin();
3362   const int MemSize = MMO->getSize();
3363   const LLT S32 = LLT::scalar(32);
3364 
3365   Register Dst = MI.getOperand(0).getReg();
3366   Register RSrc = MI.getOperand(2).getReg();
3367 
3368   // The typed intrinsics add an immediate after the registers.
3369   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3370 
3371   // The struct intrinsic variants add one additional operand over raw.
3372   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3373   Register VIndex;
3374   int OpOffset = 0;
3375   if (HasVIndex) {
3376     VIndex = MI.getOperand(3).getReg();
3377     OpOffset = 1;
3378   }
3379 
3380   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3381   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3382 
3383   unsigned Format = 0;
3384   if (IsTyped) {
3385     Format = MI.getOperand(5 + OpOffset).getImm();
3386     ++OpOffset;
3387   }
3388 
3389   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3390   unsigned ImmOffset;
3391   unsigned TotalOffset;
3392 
3393   LLT Ty = MRI.getType(Dst);
3394   LLT EltTy = Ty.getScalarType();
3395   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3396   const bool Unpacked = ST.hasUnpackedD16VMem();
3397 
3398   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3399   if (TotalOffset != 0)
3400     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3401 
3402   unsigned Opc;
3403 
3404   if (IsTyped) {
3405     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3406                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3407   } else if (IsFormat) {
3408     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3409                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3410   } else {
3411     switch (MemSize) {
3412     case 1:
3413       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3414       break;
3415     case 2:
3416       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3417       break;
3418     default:
3419       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3420       break;
3421     }
3422   }
3423 
3424   Register LoadDstReg;
3425 
3426   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3427   LLT UnpackedTy = Ty.changeElementSize(32);
3428 
3429   if (IsExtLoad)
3430     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3431   else if (Unpacked && IsD16 && Ty.isVector())
3432     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3433   else
3434     LoadDstReg = Dst;
3435 
3436   if (!VIndex)
3437     VIndex = B.buildConstant(S32, 0).getReg(0);
3438 
3439   auto MIB = B.buildInstr(Opc)
3440     .addDef(LoadDstReg)         // vdata
3441     .addUse(RSrc)               // rsrc
3442     .addUse(VIndex)             // vindex
3443     .addUse(VOffset)            // voffset
3444     .addUse(SOffset)            // soffset
3445     .addImm(ImmOffset);         // offset(imm)
3446 
3447   if (IsTyped)
3448     MIB.addImm(Format);
3449 
3450   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3451      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3452      .addMemOperand(MMO);
3453 
3454   if (LoadDstReg != Dst) {
3455     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3456 
3457     // The result was widened for an extending load; truncate it back.
3458     if (IsExtLoad)
3459       B.buildTrunc(Dst, LoadDstReg);
3460     else {
3461       // Repack to original 16-bit vector result
3462       // FIXME: G_TRUNC should work, but legalization currently fails
3463       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3464       SmallVector<Register, 4> Repack;
3465       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3466         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3467       B.buildMerge(Dst, Repack);
3468     }
3469   }
3470 
3471   MI.eraseFromParent();
3472   return true;
3473 }
3474 
3475 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3476                                                MachineIRBuilder &B,
3477                                                bool IsInc) const {
3478   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3479                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3480   B.buildInstr(Opc)
3481     .addDef(MI.getOperand(0).getReg())
3482     .addUse(MI.getOperand(2).getReg())
3483     .addUse(MI.getOperand(3).getReg())
3484     .cloneMemRefs(MI);
3485   MI.eraseFromParent();
3486   return true;
3487 }
3488 
3489 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3490   switch (IntrID) {
3491   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3492   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3493     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3494   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3495   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3496     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3497   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3498   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3499     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3500   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3501   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3502     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3503   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3504   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3505     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3506   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3507   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3508     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3509   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3510   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3511     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3512   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3513   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3514     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3515   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3516   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3517     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3518   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3519   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3520     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3521   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3522   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3523     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3524   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3525   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3526     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3527   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3528   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3529     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3530   default:
3531     llvm_unreachable("unhandled atomic opcode");
3532   }
3533 }
3534 
3535 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3536                                                MachineIRBuilder &B,
3537                                                Intrinsic::ID IID) const {
3538   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3539                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3540 
3541   Register Dst = MI.getOperand(0).getReg();
3542   Register VData = MI.getOperand(2).getReg();
3543 
3544   Register CmpVal;
3545   int OpOffset = 0;
3546 
3547   if (IsCmpSwap) {
3548     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3549     ++OpOffset;
3550   }
3551 
3552   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3553   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3554 
3555   // The struct intrinsic variants add one additional operand over raw.
3556   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3557   Register VIndex;
3558   if (HasVIndex) {
3559     VIndex = MI.getOperand(4 + OpOffset).getReg();
3560     ++OpOffset;
3561   }
3562 
3563   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3564   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3565   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3566 
3567   MachineMemOperand *MMO = *MI.memoperands_begin();
3568 
3569   unsigned ImmOffset;
3570   unsigned TotalOffset;
3571   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3572   if (TotalOffset != 0)
3573     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3574 
3575   if (!VIndex)
3576     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3577 
3578   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3579     .addDef(Dst)
3580     .addUse(VData); // vdata
3581 
3582   if (IsCmpSwap)
3583     MIB.addReg(CmpVal);
3584 
3585   MIB.addUse(RSrc)               // rsrc
3586      .addUse(VIndex)             // vindex
3587      .addUse(VOffset)            // voffset
3588      .addUse(SOffset)            // soffset
3589      .addImm(ImmOffset)          // offset(imm)
3590      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3591      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3592      .addMemOperand(MMO);
3593 
3594   MI.eraseFromParent();
3595   return true;
3596 }
3597 
3598 /// Pack the s16 typed address registers of the image operands in \p MI into
3599 /// dword sized <2 x s16> values, appending them to \p PackedAddrs.
3600 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3601                                         SmallVectorImpl<Register> &PackedAddrs,
3602                                         int AddrIdx, int DimIdx, int EndIdx,
3603                                         int NumGradients) {
3604   const LLT S16 = LLT::scalar(16);
3605   const LLT V2S16 = LLT::vector(2, 16);
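  // For example, a 2D sample with 16-bit coordinates (u, v) is packed into a
  // single <2 x s16> operand, while an odd trailing coordinate (or an odd
  // gradient component) is paired with an undef half in the loop below.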
3606 
3607   for (int I = AddrIdx; I < EndIdx; ++I) {
3608     MachineOperand &SrcOp = MI.getOperand(I);
3609     if (!SrcOp.isReg())
3610       continue; // _L to _LZ may have eliminated this.
3611 
3612     Register AddrReg = SrcOp.getReg();
3613 
3614     if (I < DimIdx) {
3615       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3616       PackedAddrs.push_back(AddrReg);
3617     } else {
3618       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3619       // derivatives dx/dh and dx/dv are packed with undef.
3620       if (((I + 1) >= EndIdx) ||
3621           ((NumGradients / 2) % 2 == 1 &&
3622            (I == DimIdx + (NumGradients / 2) - 1 ||
3623             I == DimIdx + NumGradients - 1)) ||
3624           // Check for _L to _LZ optimization
3625           !MI.getOperand(I + 1).isReg()) {
3626         PackedAddrs.push_back(
3627             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3628                 .getReg(0));
3629       } else {
3630         PackedAddrs.push_back(
3631             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3632                 .getReg(0));
3633         ++I;
3634       }
3635     }
3636   }
3637 }
3638 
3639 /// Convert from separate vaddr components to a single vector address register,
3640 /// and replace the remaining operands with $noreg.
3641 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3642                                      int DimIdx, int NumVAddrs) {
3643   const LLT S32 = LLT::scalar(32);
3644 
3645   SmallVector<Register, 8> AddrRegs;
3646   for (int I = 0; I != NumVAddrs; ++I) {
3647     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3648     if (SrcOp.isReg()) {
3649       AddrRegs.push_back(SrcOp.getReg());
3650       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3651     }
3652   }
3653 
3654   int NumAddrRegs = AddrRegs.size();
3655   if (NumAddrRegs != 1) {
3656     // Round up to 8 elements for v5-v7
3657     // FIXME: Missing intermediate sized register classes and instructions.
3658     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3659       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3660       auto Undef = B.buildUndef(S32);
3661       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3662       NumAddrRegs = RoundedNumRegs;
3663     }
3664 
3665     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3666     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3667   }
3668 
3669   for (int I = 1; I != NumVAddrs; ++I) {
3670     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3671     if (SrcOp.isReg())
3672       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3673   }
3674 }
3675 
3676 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3677 ///
3678 /// Depending on the subtarget, loads and stores with 16-bit element data need to be
3679 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3680 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3681 /// registers.
3682 ///
3683 /// We don't want to directly select image instructions just yet, but we also want
3684 /// to expose all register repacking to the legalizer/combiners. We also don't
3685 /// want a selected instruction entering RegBankSelect. In order to avoid
3686 /// defining a multitude of intermediate image instructions, directly hack on
3687 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3688 /// now-unnecessary arguments with $noreg.
3689 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3690     MachineInstr &MI, MachineIRBuilder &B,
3691     GISelChangeObserver &Observer,
3692     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3693 
3694   const int NumDefs = MI.getNumExplicitDefs();
3695   bool IsTFE = NumDefs == 2;
3696   // We are only processing the operands of d16 image operations on subtargets
3697   // that use the unpacked register layout, or need to repack the TFE result.
3698 
3699   // TODO: Do we need to guard against already legalized intrinsics?
3700   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3701     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3702 
3703   MachineRegisterInfo *MRI = B.getMRI();
3704   const LLT S32 = LLT::scalar(32);
3705   const LLT S16 = LLT::scalar(16);
3706   const LLT V2S16 = LLT::vector(2, 16);
3707 
3708   // Index of first address argument
3709   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3710 
3711   int NumVAddrs, NumGradients;
3712   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3713   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3714     getDMaskIdx(BaseOpcode, NumDefs);
3715   unsigned DMask = 0;
3716 
3717   // Check for 16-bit addresses and pack them if so.
3718   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3719   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3720   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3721   const bool IsG16 = GradTy == S16;
3722   const bool IsA16 = AddrTy == S16;
3723 
3724   int DMaskLanes = 0;
3725   if (!BaseOpcode->Atomic) {
3726     DMask = MI.getOperand(DMaskIdx).getImm();
3727     if (BaseOpcode->Gather4) {
3728       DMaskLanes = 4;
3729     } else if (DMask != 0) {
3730       DMaskLanes = countPopulation(DMask);
3731     } else if (!IsTFE && !BaseOpcode->Store) {
3732       // If dmask is 0, this is a no-op load. This can be eliminated.
3733       B.buildUndef(MI.getOperand(0));
3734       MI.eraseFromParent();
3735       return true;
3736     }
3737   }
3738 
3739   Observer.changingInstr(MI);
3740   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3741 
3742   unsigned NewOpcode = NumDefs == 0 ?
3743     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3744 
3745   // Track that we legalized this
3746   MI.setDesc(B.getTII().get(NewOpcode));
3747 
3748   // Expecting to get an error flag since TFE is on and dmask is 0. Force
3749   // dmask to be at least 1, otherwise the instruction will fail.
3750   if (IsTFE && DMask == 0) {
3751     DMask = 0x1;
3752     DMaskLanes = 1;
3753     MI.getOperand(DMaskIdx).setImm(DMask);
3754   }
3755 
3756   if (BaseOpcode->Atomic) {
3757     Register VData0 = MI.getOperand(2).getReg();
3758     LLT Ty = MRI->getType(VData0);
3759 
3760     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3761     if (Ty.isVector())
3762       return false;
3763 
3764     if (BaseOpcode->AtomicX2) {
3765       Register VData1 = MI.getOperand(3).getReg();
3766       // The two values are packed in one register.
3767       LLT PackedTy = LLT::vector(2, Ty);
3768       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3769       MI.getOperand(2).setReg(Concat.getReg(0));
3770       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3771     }
3772   }
3773 
3774   int CorrectedNumVAddrs = NumVAddrs;
3775 
3776   // Optimize _L to _LZ when the LOD is known to be zero (or negative).
3777   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3778         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3779     const ConstantFP *ConstantLod;
3780     const int LodIdx = AddrIdx + NumVAddrs - 1;
3781 
3782     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3783       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3784         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3785         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3786           LZMappingInfo->LZ, ImageDimIntr->Dim);
3787 
3788         // The starting indexes should remain in the same place.
3789         --NumVAddrs;
3790         --CorrectedNumVAddrs;
3791 
3792         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3793           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3794         MI.RemoveOperand(LodIdx);
3795       }
3796     }
3797   }
3798 
3799   // Optimize _mip away when 'lod' is zero.
3800   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3801     int64_t ConstantLod;
3802     const int LodIdx = AddrIdx + NumVAddrs - 1;
3803 
3804     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3805       if (ConstantLod == 0) {
3806         // TODO: Change the intrinsic opcode and remove the operand instead of
3807         // replacing it with 0, as is done for the _L to _LZ handling above.
3808         MI.getOperand(LodIdx).ChangeToImmediate(0);
3809         --CorrectedNumVAddrs;
3810       }
3811     }
3812   }
3813 
3814   // Rewrite the addressing register layout before doing anything else.
3815   if (IsA16 || IsG16) {
3816     if (IsA16) {
3817       // The target must support the feature, and the gradients need to be 16-bit too.
3818       if (!ST.hasA16() || !IsG16)
3819         return false;
3820     } else if (!ST.hasG16())
3821       return false;
3822 
3823     if (NumVAddrs > 1) {
3824       SmallVector<Register, 4> PackedRegs;
3825       // Don't compress addresses for G16
3826       const int PackEndIdx =
3827           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3828       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3829                                   PackEndIdx, NumGradients);
3830 
3831       if (!IsA16) {
3832         // Add uncompressed address
3833         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3834           Register AddrReg = MI.getOperand(I).getReg();
3835           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3836           PackedRegs.push_back(AddrReg);
3837         }
3838       }
3839 
3840       // See also below in the non-a16 branch
3841       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3842 
3843       if (!UseNSA && PackedRegs.size() > 1) {
3844         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3845         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3846         PackedRegs[0] = Concat.getReg(0);
3847         PackedRegs.resize(1);
3848       }
3849 
3850       const int NumPacked = PackedRegs.size();
3851       for (int I = 0; I != NumVAddrs; ++I) {
3852         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3853         if (!SrcOp.isReg()) {
3854           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3855           continue;
3856         }
3857 
3858         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3859 
3860         if (I < NumPacked)
3861           SrcOp.setReg(PackedRegs[I]);
3862         else
3863           SrcOp.setReg(AMDGPU::NoRegister);
3864       }
3865     }
3866   } else {
3867     // If the register allocator cannot place the address registers contiguously
3868     // without introducing moves, then using the non-sequential address encoding
3869     // is always preferable, since it saves VALU instructions and is usually a
3870     // wash in terms of code size or even better.
3871     //
3872     // However, we currently have no way of hinting to the register allocator
3873     // that MIMG addresses should be placed contiguously when it is possible to
3874     // do so, so force non-NSA for the common 2-address case as a heuristic.
3875     //
3876     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3877     // allocation when possible.
3878     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3879 
3880     if (!UseNSA && NumVAddrs > 1)
3881       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3882   }
3883 
3884   int Flags = 0;
3885   if (IsA16)
3886     Flags |= 1;
3887   if (IsG16)
3888     Flags |= 2;
3889   MI.addOperand(MachineOperand::CreateImm(Flags));
3890 
3891   if (BaseOpcode->Store) { // No TFE for stores?
3892     // TODO: Handle dmask trim
3893     Register VData = MI.getOperand(1).getReg();
3894     LLT Ty = MRI->getType(VData);
3895     if (!Ty.isVector() || Ty.getElementType() != S16)
3896       return true;
3897 
3898     Register RepackedReg = handleD16VData(B, *MRI, VData);
3899     if (RepackedReg != VData) {
3900       MI.getOperand(1).setReg(RepackedReg);
3901     }
3902 
3903     return true;
3904   }
3905 
3906   Register DstReg = MI.getOperand(0).getReg();
3907   LLT Ty = MRI->getType(DstReg);
3908   const LLT EltTy = Ty.getScalarType();
3909   const bool IsD16 = Ty.getScalarType() == S16;
3910   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3911 
3912   // Confirm that the return type is large enough for the dmask specified
3913   if (NumElts < DMaskLanes)
3914     return false;
3915 
3916   if (NumElts > 4 || DMaskLanes > 4)
3917     return false;
3918 
3919   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3920   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3921 
3922   // The raw dword aligned data component of the load. The only legal cases
3923   // where this matters should be when using the packed D16 format, for
3924   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3925   LLT RoundedTy;
3926 
3927   // S32 vector to cover all data, plus the TFE result element.
3928   LLT TFETy;
3929 
3930   // Register type to use for each loaded component. Will be S32 or V2S16.
3931   LLT RegTy;
3932 
3933   if (IsD16 && ST.hasUnpackedD16VMem()) {
3934     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3935     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3936     RegTy = S32;
3937   } else {
3938     unsigned EltSize = EltTy.getSizeInBits();
3939     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3940     unsigned RoundedSize = 32 * RoundedElts;
3941     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3942     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3943     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3944   }
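  // As an example, a packed D16 <3 x s16> result rounds up to RoundedTy =
  // <4 x s16> (two dwords); with TFE requested TFETy = <3 x s32> and the
  // unmerged pieces use RegTy = S32, otherwise RegTy = <2 x s16>.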
3945 
3946   // The return type does not need adjustment.
3947   // TODO: Should we change s16 case to s32 or <2 x s16>?
3948   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3949     return true;
3950 
3951   Register Dst1Reg;
3952 
3953   // Insert after the instruction.
3954   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3955 
3956   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3957   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3958   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3959   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3960 
3961   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3962 
3963   MI.getOperand(0).setReg(NewResultReg);
3964 
3965   // In the IR, TFE is supposed to be used with a 2 element struct return
3966   // type. The instruction really returns these two values in one contiguous
3967   // register, with one additional dword beyond the loaded data. Rewrite the
3968   // return type to use a single register result.
3969 
3970   if (IsTFE) {
3971     Dst1Reg = MI.getOperand(1).getReg();
3972     if (MRI->getType(Dst1Reg) != S32)
3973       return false;
3974 
3975     // TODO: Make sure the TFE operand bit is set.
3976     MI.RemoveOperand(1);
3977 
3978     // Handle the easy case that requires no repack instructions.
3979     if (Ty == S32) {
3980       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3981       return true;
3982     }
3983   }
3984 
3985   // Now figure out how to copy the new result register back into the old
3986   // result.
3987   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3988 
3989   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3990 
3991   if (ResultNumRegs == 1) {
3992     assert(!IsTFE);
3993     ResultRegs[0] = NewResultReg;
3994   } else {
3995     // We have to repack into a new vector of some kind.
3996     for (int I = 0; I != NumDataRegs; ++I)
3997       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3998     B.buildUnmerge(ResultRegs, NewResultReg);
3999 
4000     // Drop the final TFE element to get the data part. The TFE result is
4001     // directly written to the right place already.
4002     if (IsTFE)
4003       ResultRegs.resize(NumDataRegs);
4004   }
4005 
4006   // For an s16 scalar result, we form an s32 result with a truncate regardless
4007   // of packed vs. unpacked.
4008   if (IsD16 && !Ty.isVector()) {
4009     B.buildTrunc(DstReg, ResultRegs[0]);
4010     return true;
4011   }
4012 
4013   // Avoid a build/concat_vector of 1 entry.
4014   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4015     B.buildBitcast(DstReg, ResultRegs[0]);
4016     return true;
4017   }
4018 
4019   assert(Ty.isVector());
4020 
4021   if (IsD16) {
4022     // For packed D16 results with TFE enabled, all the data components are
4023     // S32. Cast back to the expected type.
4024     //
4025     // TODO: We don't really need to load s32 elements. We would only need one
4026     // cast for the TFE result if a multiple of v2s16 was used.
4027     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4028       for (Register &Reg : ResultRegs)
4029         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4030     } else if (ST.hasUnpackedD16VMem()) {
4031       for (Register &Reg : ResultRegs)
4032         Reg = B.buildTrunc(S16, Reg).getReg(0);
4033     }
4034   }
4035 
4036   auto padWithUndef = [&](LLT Ty, int NumElts) {
4037     if (NumElts == 0)
4038       return;
4039     Register Undef = B.buildUndef(Ty).getReg(0);
4040     for (int I = 0; I != NumElts; ++I)
4041       ResultRegs.push_back(Undef);
4042   };
4043 
4044   // Pad out any elements eliminated due to the dmask.
4045   LLT ResTy = MRI->getType(ResultRegs[0]);
4046   if (!ResTy.isVector()) {
4047     padWithUndef(ResTy, NumElts - ResultRegs.size());
4048     B.buildBuildVector(DstReg, ResultRegs);
4049     return true;
4050   }
4051 
4052   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4053   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4054 
4055   // Deal with the one annoying legal case.
4056   const LLT V3S16 = LLT::vector(3, 16);
4057   if (Ty == V3S16) {
4058     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4059     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4060     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4061     return true;
4062   }
4063 
4064   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4065   B.buildConcatVectors(DstReg, ResultRegs);
4066   return true;
4067 }
4068 
4069 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4070   MachineInstr &MI, MachineIRBuilder &B,
4071   GISelChangeObserver &Observer) const {
4072   Register Dst = MI.getOperand(0).getReg();
4073   LLT Ty = B.getMRI()->getType(Dst);
4074   unsigned Size = Ty.getSizeInBits();
4075   MachineFunction &MF = B.getMF();
4076 
4077   Observer.changingInstr(MI);
4078 
4079   // FIXME: We don't really need this intermediate instruction. The intrinsic
4080   // should be fixed to have a memory operand. Since it's readnone, we're not
4081   // allowed to add one.
4082   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4083   MI.RemoveOperand(1); // Remove intrinsic ID
4084 
4085   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4086   // TODO: Should this use datalayout alignment?
4087   const unsigned MemSize = (Size + 7) / 8;
4088   const Align MemAlign(4);
4089   MachineMemOperand *MMO = MF.getMachineMemOperand(
4090       MachinePointerInfo(),
4091       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4092           MachineMemOperand::MOInvariant,
4093       MemSize, MemAlign);
4094   MI.addMemOperand(MF, MMO);
4095 
4096   // There are no 96-bit result scalar loads, but widening to 128-bit should
4097   // always be legal. We may need to restore this to a 96-bit result if it turns
4098   // out this needs to be converted to a vector load during RegBankSelect.
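  // For example, a 96-bit <3 x s32> result is widened to <4 x s32> (and an s96
  // scalar to s128) by the helper below.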
4099   if (!isPowerOf2_32(Size)) {
4100     LegalizerHelper Helper(MF, *this, Observer, B);
4101 
4102     if (Ty.isVector())
4103       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4104     else
4105       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4106   }
4107 
4108   Observer.changedInstr(MI);
4109   return true;
4110 }
4111 
4112 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4113                                                 MachineRegisterInfo &MRI,
4114                                                 MachineIRBuilder &B) const {
4115   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4116   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4117       !ST.isTrapHandlerEnabled()) {
4118     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4119   } else {
4120     // Pass queue pointer to trap handler as input, and insert trap instruction
4121     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4122     const ArgDescriptor *Arg =
4123         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4124     if (!Arg)
4125       return false;
4126     MachineRegisterInfo &MRI = *B.getMRI();
4127     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4128     Register LiveIn = getLiveInRegister(
4129         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4130         /*InsertLiveInCopy=*/false);
4131     if (!loadInputValue(LiveIn, B, Arg))
4132       return false;
4133     B.buildCopy(SGPR01, LiveIn);
4134     B.buildInstr(AMDGPU::S_TRAP)
4135         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4136         .addReg(SGPR01, RegState::Implicit);
4137   }
4138 
4139   MI.eraseFromParent();
4140   return true;
4141 }
4142 
4143 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4144     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4145   // If this is a non-HSA path or the trap handler is disabled, report a
4146   // warning accordingly.
4147   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4148       !ST.isTrapHandlerEnabled()) {
4149     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4150                                      "debugtrap handler not supported",
4151                                      MI.getDebugLoc(), DS_Warning);
4152     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4153     Ctx.diagnose(NoTrap);
4154   } else {
4155     // Insert debug-trap instruction
4156     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4157   }
4158 
4159   MI.eraseFromParent();
4160   return true;
4161 }
4162 
4163 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4164                                             MachineInstr &MI) const {
4165   MachineIRBuilder &B = Helper.MIRBuilder;
4166   MachineRegisterInfo &MRI = *B.getMRI();
4167 
4168   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4169   auto IntrID = MI.getIntrinsicID();
4170   switch (IntrID) {
4171   case Intrinsic::amdgcn_if:
4172   case Intrinsic::amdgcn_else: {
4173     MachineInstr *Br = nullptr;
4174     MachineBasicBlock *UncondBrTarget = nullptr;
4175     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4176       const SIRegisterInfo *TRI
4177         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4178 
4179       Register Def = MI.getOperand(1).getReg();
4180       Register Use = MI.getOperand(3).getReg();
4181 
4182       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4183       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4184       if (IntrID == Intrinsic::amdgcn_if) {
4185         B.buildInstr(AMDGPU::SI_IF)
4186           .addDef(Def)
4187           .addUse(Use)
4188           .addMBB(UncondBrTarget);
4189       } else {
4190         B.buildInstr(AMDGPU::SI_ELSE)
4191           .addDef(Def)
4192           .addUse(Use)
4193           .addMBB(UncondBrTarget)
4194           .addImm(0);
4195       }
4196 
4197       if (Br) {
4198         Br->getOperand(0).setMBB(CondBrTarget);
4199       } else {
4200         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4201         // since we're swapping branch targets it needs to be reinserted.
4202         // FIXME: IRTranslator should probably not do this
4203         B.buildBr(*CondBrTarget);
4204       }
4205 
4206       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4207       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4208       MI.eraseFromParent();
4209       BrCond->eraseFromParent();
4210       return true;
4211     }
4212 
4213     return false;
4214   }
4215   case Intrinsic::amdgcn_loop: {
4216     MachineInstr *Br = nullptr;
4217     MachineBasicBlock *UncondBrTarget = nullptr;
4218     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4219       const SIRegisterInfo *TRI
4220         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4221 
4222       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4223       Register Reg = MI.getOperand(2).getReg();
4224 
4225       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4226       B.buildInstr(AMDGPU::SI_LOOP)
4227         .addUse(Reg)
4228         .addMBB(UncondBrTarget);
4229 
4230       if (Br)
4231         Br->getOperand(0).setMBB(CondBrTarget);
4232       else
4233         B.buildBr(*CondBrTarget);
4234 
4235       MI.eraseFromParent();
4236       BrCond->eraseFromParent();
4237       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4238       return true;
4239     }
4240 
4241     return false;
4242   }
4243   case Intrinsic::amdgcn_kernarg_segment_ptr:
4244     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4245       // This only makes sense to call in a kernel, so just lower to null.
4246       B.buildConstant(MI.getOperand(0).getReg(), 0);
4247       MI.eraseFromParent();
4248       return true;
4249     }
4250 
4251     return legalizePreloadedArgIntrin(
4252       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4253   case Intrinsic::amdgcn_implicitarg_ptr:
4254     return legalizeImplicitArgPtr(MI, MRI, B);
4255   case Intrinsic::amdgcn_workitem_id_x:
4256     return legalizePreloadedArgIntrin(MI, MRI, B,
4257                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4258   case Intrinsic::amdgcn_workitem_id_y:
4259     return legalizePreloadedArgIntrin(MI, MRI, B,
4260                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4261   case Intrinsic::amdgcn_workitem_id_z:
4262     return legalizePreloadedArgIntrin(MI, MRI, B,
4263                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4264   case Intrinsic::amdgcn_workgroup_id_x:
4265     return legalizePreloadedArgIntrin(MI, MRI, B,
4266                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4267   case Intrinsic::amdgcn_workgroup_id_y:
4268     return legalizePreloadedArgIntrin(MI, MRI, B,
4269                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4270   case Intrinsic::amdgcn_workgroup_id_z:
4271     return legalizePreloadedArgIntrin(MI, MRI, B,
4272                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4273   case Intrinsic::amdgcn_dispatch_ptr:
4274     return legalizePreloadedArgIntrin(MI, MRI, B,
4275                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4276   case Intrinsic::amdgcn_queue_ptr:
4277     return legalizePreloadedArgIntrin(MI, MRI, B,
4278                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4279   case Intrinsic::amdgcn_implicit_buffer_ptr:
4280     return legalizePreloadedArgIntrin(
4281       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4282   case Intrinsic::amdgcn_dispatch_id:
4283     return legalizePreloadedArgIntrin(MI, MRI, B,
4284                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4285   case Intrinsic::amdgcn_fdiv_fast:
4286     return legalizeFDIVFastIntrin(MI, MRI, B);
4287   case Intrinsic::amdgcn_is_shared:
4288     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4289   case Intrinsic::amdgcn_is_private:
4290     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4291   case Intrinsic::amdgcn_wavefrontsize: {
4292     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4293     MI.eraseFromParent();
4294     return true;
4295   }
4296   case Intrinsic::amdgcn_s_buffer_load:
4297     return legalizeSBufferLoad(MI, B, Helper.Observer);
4298   case Intrinsic::amdgcn_raw_buffer_store:
4299   case Intrinsic::amdgcn_struct_buffer_store:
4300     return legalizeBufferStore(MI, MRI, B, false, false);
4301   case Intrinsic::amdgcn_raw_buffer_store_format:
4302   case Intrinsic::amdgcn_struct_buffer_store_format:
4303     return legalizeBufferStore(MI, MRI, B, false, true);
4304   case Intrinsic::amdgcn_raw_tbuffer_store:
4305   case Intrinsic::amdgcn_struct_tbuffer_store:
4306     return legalizeBufferStore(MI, MRI, B, true, true);
4307   case Intrinsic::amdgcn_raw_buffer_load:
4308   case Intrinsic::amdgcn_struct_buffer_load:
4309     return legalizeBufferLoad(MI, MRI, B, false, false);
4310   case Intrinsic::amdgcn_raw_buffer_load_format:
4311   case Intrinsic::amdgcn_struct_buffer_load_format:
4312     return legalizeBufferLoad(MI, MRI, B, true, false);
4313   case Intrinsic::amdgcn_raw_tbuffer_load:
4314   case Intrinsic::amdgcn_struct_tbuffer_load:
4315     return legalizeBufferLoad(MI, MRI, B, true, true);
4316   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4317   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4318   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4319   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4320   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4321   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4322   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4323   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4324   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4325   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4326   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4327   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4328   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4329   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4330   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4331   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4332   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4333   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4334   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4335   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4336   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4337   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4338   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4339   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4340   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4341   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4342     return legalizeBufferAtomic(MI, B, IntrID);
4343   case Intrinsic::amdgcn_atomic_inc:
4344     return legalizeAtomicIncDec(MI, B, true);
4345   case Intrinsic::amdgcn_atomic_dec:
4346     return legalizeAtomicIncDec(MI, B, false);
4347   case Intrinsic::trap:
4348     return legalizeTrapIntrinsic(MI, MRI, B);
4349   case Intrinsic::debugtrap:
4350     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4351   default: {
4352     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4353             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4354       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4355     return true;
4356   }
4357   }
4358 
4359   return true;
4360 }
4361